Importing Pandas

In [107]:
import pandas as pd

Converting bua into a csv file

In [108]:
#reading the XLS file
bua_xls = pd.read_excel("initial_data/bua.xls")

#saving it as a csv file
bua_xls.to_csv("fixed_csv_files/bua_converted.csv", index=False, encoding="utf-8")
bua = pd.read_csv("fixed_csv_files/bua_converted.csv")

Adding a header to buf

In [109]:
#creating the headers
headers = ["Cote", "Titre", "Auteur", "Lieu", "Edition", "Annee", "Nb pages", "Matiere", "Inventaire"]

#loading CSV without header
buf = pd.read_csv("initial_data/buf.csv", names=headers, encoding="ISO-8859-1", delimiter=";")

#saving the corrected CSV
buf.to_csv("fixed_csv_files/fixed_buf.csv", index=False, encoding="utf-8")

removing duplicates

In [110]:
buf = buf.drop_duplicates()
bua = bua.drop_duplicates()

#removing leading/trailing spaces from column names if any
buf.columns = bua.columns.str.strip()
bua.columns = bua.columns.str.strip()

data transformation

In [111]:
# Convert 'Annee' to numeric, forcing errors to NaN (if there are any invalid entries)
buf['Annee'] = pd.to_numeric(buf['Annee'], errors='coerce')
buf['Nb pages'] = pd.to_numeric(buf['Nb pages'], errors='coerce')

bua['Annee'] = pd.to_numeric(buf['Annee'], errors='coerce')
bua['Nb pages'] = pd.to_numeric(buf['Nb pages'], errors='coerce')

Check for Missing Values

In [112]:
# Check for missing values in each column
print(buf.isnull().sum())

Cote            0
Titre           0
Auteur         13
Lieu          313
Edition       108
Annee          14
Nb pages       70
Matiere         1
Inventaire      0
dtype: int64


In [113]:
# Check for missing values in each column
print(bua.isnull().sum())

Cote            0
Titre          26
Auteur          0
Lieu          474
Edition       766
Annee           3
Nb pages       16
Matiere        13
Inventaire      2
dtype: int64


Cleaning buf

In [114]:
# Fill missing values with "Inconnu" (Unknown) for categorical fields
buf['Auteur'] = buf['Auteur'].fillna('Inconnu')  # Replace missing authors with "Inconnu"
buf['Lieu'] = buf['Lieu'].fillna('Inconnu')  # Replace missing locations with "Inconnu"
buf['Edition'] = buf['Edition'].fillna('Non spécifié')  # Replace missing editions with "Non spécifié"
buf['Matiere'] = buf['Matiere'].fillna('Inconnu')  # Replace missing subjects with "Inconnu"


# For numerical fields like "Annee" and "Nb pages", use the median or mode
buf['Annee'] = buf['Annee'].fillna(buf['Annee'].median())  # Fill missing years with median
buf['Nb pages'] = buf['Nb pages'].fillna(buf['Nb pages'].median())  # Fill missing pages with median

# Check missing values
print(buf.isnull().sum())

# Save cleaned French dataset
buf.to_csv("cleaned_data/cleaned_buf.csv", index=False, encoding="utf-8")


Cote          0
Titre         0
Auteur        0
Lieu          0
Edition       0
Annee         0
Nb pages      0
Matiere       0
Inventaire    0
dtype: int64


Cleaning bua

In [115]:

#droping rows with missing 'Inventaire' and 'Titre' values
bua = bua.dropna(subset=['Titre'])
bua = bua.dropna(subset=['Inventaire'])


#filling missing values with Arabic terms for categorical columns
bua['Lieu'] = bua['Lieu'].fillna('غير محدد')  # Replace missing locations with "غير محدد"
bua['Edition'] = bua['Edition'].fillna('غير محدد')  # Replace missing editions with "غير محدد"
bua['Matiere'] = bua['Matiere'].fillna('غير محدد')  # Replace missing subjects with "غير محدد"


# For numerical columns, use the median to fill missing values
bua['Annee'] = bua['Annee'].fillna(bua['Annee'].median())  # Fill missing years with median
bua['Nb pages'] = bua['Nb pages'].fillna(bua['Nb pages'].median())  # Fill missing pages with median

# Check if any missing values remain
print(bua.isnull().sum())

# Save the cleaned Arabic dataset
bua.to_csv("cleaned_data/cleaned_bua.csv", index=False, encoding="utf-8")


Cote          0
Titre         0
Auteur        0
Lieu          0
Edition       0
Annee         0
Nb pages      0
Matiere       0
Inventaire    0
dtype: int64
