In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from thefuzz import fuzz
from thefuzz import process

# Load the concatenated listings; the first CSV column becomes the index.
cars = pd.read_csv("./concatenated_data.csv", index_col=0)

# The free-text "description" column is discarded before renaming.
cars = cars.drop(columns="description")

# Positional rename: one name per remaining column, in file order.
# A mismatch in the number of columns raises; a mismatch in order would
# silently mislabel columns.
cars.columns = [
    "brand",
    "model",
    "price",
    "mileage",
    "year",
    "location",
    "date_added",
    "body_type",
    "fuel",
    "power",
    "transmission",
    "color",
    "price_new",
    "engine_size",
]
cars.info()


Replace each price with its absolute value: price -> abs(price)


In [None]:

# Negative prices are replaced by their magnitude.
cars["price"] = cars["price"].abs()


If price is 0 -> price = nan


In [None]:
# A price of exactly 0 is treated as a missing value.
price_is_0 = cars["price"].eq(0)
cars.loc[price_is_0, "price"] = np.nan


If price is less than or equal to 500 -> price *= 1000


In [None]:

# Prices of at most 500 are multiplied by 1000.
# (The mask name mentions 1000, but the cut-off applied here is 500.)
price_less_than_1000 = cars["price"].le(500)
cars.loc[price_less_than_1000, "price"] = (
    cars.loc[price_less_than_1000, "price"] * 1000
)


In [None]:
# Prices of 1e8 (100,000,000) or more are treated as missing.
# NOTE(review): the mask name suggests a 1,000,000 cut-off, but the
# threshold actually applied is 1e8 - confirm which one is intended.
price_over_1000000 = cars["price"].ge(1e8)
cars.loc[price_over_1000000, "price"] = np.nan


In [None]:
# Horizontal box plot of the price column; the x axis uses plain tick
# labels instead of scientific notation.
price_frame = cars[["price"]]
ax = sns.boxplot(orient="h", data=price_frame)
ax.ticklabel_format(axis="x", style="plain")
plt.show()


In [None]:
# Tukey fences (1.5 * IQR) computed on the non-missing prices.
known_prices = cars["price"].dropna()
q1, q3 = np.percentile(known_prices, [25, 75])
iqr = q3 - q1

lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
print(lower_bound)


In [None]:
# Coarse (5-bin) histogram of the price column.
price_column = cars["price"]
price_column.hist(bins=5)
plt.show()


In [None]:
# Display the listings priced above 400,000 for manual inspection.
cars[cars["price"].gt(400_000)]


In [None]:
# Horizontal box plot of the price column (default tick formatting).
price_only = cars[["price"]]
sns.boxplot(orient="h", data=price_only)
plt.show()


In [None]:
# Negative mileage values are replaced by their magnitude.
cars["mileage"] = cars["mileage"].abs()


In [None]:
# A mileage strictly below 10 is treated as a missing value.
mileage_less_than_10 = cars["mileage"].lt(10)
cars.loc[mileage_less_than_10, "mileage"] = np.nan


In [None]:
# Mileage values of at most 1000 are multiplied by 1000.
mileage_less_than_1000 = cars["mileage"].le(1000)
cars.loc[mileage_less_than_1000, "mileage"] = (
    cars.loc[mileage_less_than_1000, "mileage"] * 1000
)


In [None]:
# A mileage of 1,000,000 or more is treated as a missing value.
mileage_over_1000000 = cars["mileage"].ge(1000000)
cars.loc[mileage_over_1000000, "mileage"] = np.nan


Apply str.title() to every text (object-dtype) column


In [None]:

# Title-case the values of every column whose dtype is object; columns of
# any other dtype are left untouched.
object_columns = cars.columns[cars.dtypes == "object"]
for col in object_columns:
    cars[col] = cars[col].str.title()


In [None]:
# List the distinct values of the "location" column and how many there are.
location_values = pd.unique(cars["location"])
print(location_values, len(location_values))


In [None]:
# Map spelling variants of location names onto a single spelling each.
# Values that are not listed here are left unchanged.
variant_names = ["Béja", "La Manouba", "Gabès", "Médenine", "Kébili"]
unified_names = ["Beja", "Manouba", "Gabes", "Medenine", "Kebili"]
my_dict = dict(zip(variant_names, unified_names))
cars.replace(to_replace={"location": my_dict}, inplace=True)


In [None]:
# Explore the distinct values of the "color" column.
#
# The column itself is deliberately NOT cast with astype(str): on an object
# column that cast turns every missing value into the literal string "nan",
# which then shows up as a bogus colour and makes the isna() count below
# always report 0. Only the list of candidate values is restricted to
# non-missing strings, which is what the string-based matching further down
# (c.lower(), fuzzy matching) needs.
colors = cars["color"].dropna().astype(str).unique()
print(colors, len(colors))

# Number of rows whose color is missing.
int(cars["color"].isna().sum())


In [None]:
# Score the known colour values against "gris" (token-sort ratio) and keep
# the 30 best candidates; the scores are inspected by eye to pick a
# sensible minimum ratio.
matches = process.extract(
    "gris",
    colors,
    scorer=fuzz.token_sort_ratio,
    limit=30,
)
matches

In [None]:
# Scratch exploration of "gris" shades.
# Pass 1: any value starting with "gris" (case-insensitive) collapses to "gris".
# Pass 2: a trailing "gris" (4 characters) is cut off. Note that a value
# which is exactly "gris" after pass 1 therefore becomes the empty string.
colors_gris = []
for shade in colors:
    collapsed = "gris" if shade.lower().startswith("gris") else shade
    if collapsed.lower().endswith("gris"):
        collapsed = collapsed[:-4]
    colors_gris.append(collapsed)

# Fuzzy candidates for "Gris a", plus one hand-picked pair to calibrate the score.
matches = process.extract(
    "Gris a",
    colors,
    scorer=fuzz.token_sort_ratio,
    limit=30,
)
score = fuzz.token_sort_ratio("gris ", "Gris Anthracite")
print(score)
matches

In [None]:
def replace_matches_in_column(df, column, string_to_match, min_ratio = 47):
    """Overwrite fuzzy matches of ``string_to_match`` in ``df[column]``.

    The unique values of the column are scored against ``string_to_match``
    with ``fuzz.token_sort_ratio``. Among the 10 best-scoring values, those
    whose score is at least ``min_ratio`` are replaced, in place, by
    ``string_to_match``.

    Parameters
    ----------
    df : DataFrame modified in place.
    column : label of the column to clean.
    string_to_match : value to search for and to write into matching rows.
    min_ratio : minimum score (inclusive) for a value to be replaced.

    Returns
    -------
    None. Prints a completion message followed by the unique values as they
    were BEFORE the replacement, and their count.
    """
    # Unique values captured before any replacement takes place.
    strings = df[column].unique()

    # The 10 best-scoring candidates; each entry is indexed as [0] value, [1] score.
    scored = process.extract(
        string_to_match,
        strings,
        scorer=fuzz.token_sort_ratio,
        limit=10,
    )

    # Keep the values whose score reaches the threshold.
    close_matches = []
    for candidate in scored:
        if candidate[1] >= min_ratio:
            close_matches.append(candidate[0])

    # Overwrite every row holding one of the retained values.
    rows_with_matches = df[column].isin(close_matches)
    df.loc[rows_with_matches, column] = string_to_match

    print("All done!")
    print(strings, len(strings))

In [None]:

def replace_colors(df,column, color):
    """Collapse every shade of a base colour into one canonical label.

    Parameters
    ----------
    df : DataFrame with a "color" column; modified in place.
    column : iterable of candidate colour values (typically the unique
        values of ``df["color"]``). Despite its name this is a collection
        of values, not a column label.
    color : base colour name, e.g. "gris". A candidate counts as a shade
        when, ignoring case and surrounding whitespace, it starts with
        this name.

    Rows whose "color" equals one of the matched shades are overwritten
    with the title-cased base colour ("gris" -> "Gris"). Writing the
    lowercase argument verbatim would leave mixed-case categories next to
    the title-cased labels used elsewhere in this notebook.

    Candidates that are not strings (e.g. NaN) are skipped instead of
    raising AttributeError.
    """
    prefix = color.strip().lower()
    canonical = prefix.title()

    shades = [
        c for c in column
        if isinstance(c, str) and c.strip().lower().startswith(prefix)
    ]

    rows_with_matches = df["color"].isin(shades)
    df.loc[rows_with_matches, "color"] = canonical



     

In [None]:
# Collapse the shades of each base colour, one base colour at a time and in
# this order, using the candidate values collected in `colors`.
for base_color in ("gris", "bleu", "blanc", "vert", "rouge", "noir"):
    replace_colors(cars, colors, base_color)


In [None]:
# Re-list the distinct colour values after the shade replacement, then
# count the rows whose colour is missing.
colors = pd.unique(cars["color"])
print(colors, len(colors))
len(cars.loc[cars["color"].isna()])

Unify and fix the values of the "color" column


In [None]:
# Colour-name mapping keyed on a column called "couleur".
# NOTE(review): the column names assigned when the data is loaded are the
# English ones ("color"); "couleur" is not among them, so this replace
# presumably matches no column and changes nothing. Confirm whether this
# cell is a leftover before pointing it at "color": several of its targets
# differ from the "color" mapping applied in the following cell.
my_dict = dict(
    [
        ("Argent", "Gris"),
        ("Feuille D'Argent", "Gris"),
        ("Titanium", "Gris"),
        ("Miel", "Doré"),
        ("Champagne", "Jaune"),
        ("Grenadine", "Rouge"),
        ("Rouge Bordeaux", "Rouge"),
        ("Mauve Gris", "Mauve"),
        ("Aubergine", "Violet"),
        ("Maran", "Marron"),
        ("Corail", "Orange"),
    ]
)
cars.replace(to_replace={"couleur": my_dict}, inplace=True)

In [None]:

# Variant colour names grouped by the label they are unified to. Values of
# the "color" column that are not listed are left unchanged.
shades_by_target = {
    "Blanc": ["Blanche", "Blanc Cristal"],
    "Gris": [
        "Grise",
        "Gris Métallisé",
        "Grise Metalisee",
        "Grise Métallisée",
        "Gris Clair Métallisé",
        "Gris Foncé",
        "Gris Charbon",
        "Gris Magnetic",
    ],
    "Bleu": ["Bleu Gris", "Bleu/Gris", "Bleu Roi", "Bleu Métallisée"],
    "Argent": [
        "Gris Titanium",
        "Gris Titanuim",
        "Gris Argent",
        "Feuille D'Argent",
        "Titanium",
    ],
    "Jaune": ["Miel", "Champagne"],
    "Rouge": ["Grenadine", "Rouge Bordeaux"],
    "Violet": ["Mauve Gris", "Aubergine"],
    "Noir": ["Noire"],
    "Marron": ["Maran"],
    "Orange": ["Corail"],
}

# Flatten to the {variant: unified label} form expected by DataFrame.replace.
my_dict = {
    shade: target
    for target, shades in shades_by_target.items()
    for shade in shades
}
cars.replace(to_replace={"color": my_dict}, inplace=True)


In [None]:
# "*****" is a placeholder, not a colour: treat it as a missing value.
# Only the "color" cell is blanked. Assigning through cars[mask] = np.nan
# would overwrite EVERY column of the matching rows with NaN and discard
# otherwise valid listings.
specified_color = cars["color"] == "*****"
cars.loc[specified_color, "color"] = np.nan

In [None]:
# Strip the extra date component from "year" and convert the column to float.
# - values containing "-" keep the second "-"-separated part
# - values containing "." keep the last "."-separated part
# Missing values never match either mask.
contains_hyphen = cars["year"].str.contains("-", na=False)
cars.loc[contains_hyphen, "year"] = (
    cars.loc[contains_hyphen, "year"].str.split("-").str[1]
)

contains_point = cars["year"].str.contains(".", regex=False, na=False)
cars.loc[contains_point, "year"] = (
    cars.loc[contains_point, "year"].str.split(".").str[-1]
)

cars["year"] = cars["year"].astype(np.float64)

In [None]:
# Expand two-digit years, then blank anything outside 1960..2023:
#   10..23 -> 2010..2023
#   60..99 -> 1960..1999
# Every other value (including 0..9 and 24..59) ends up as NaN.
# (The first mask's name says 0..23, but the range applied is 10..23.)
year_between_0_and_23 = cars["year"].between(10, 23)
cars.loc[year_between_0_and_23, "year"] = (
    cars.loc[year_between_0_and_23, "year"] + 2000
)

year_more_than_60 = cars["year"].between(60, 99)
cars.loc[year_more_than_60, "year"] = cars.loc[year_more_than_60, "year"] + 1900

correct_year = cars["year"].between(1960, 2023)
cars.loc[~correct_year, "year"] = np.nan


In [None]:
# The marker "N.D" in the "power" column is treated as a missing value.
power_is_ND = cars["power"].eq("N.D")
cars.loc[power_is_ND, "power"] = np.nan

In [None]:
# Convert "power" to float: values containing "C" keep only their first
# whitespace-separated token before the cast. Missing values never match.
contains_CV = cars["power"].str.contains("C", na=False)
cars.loc[contains_CV, "power"] = cars.loc[contains_CV, "power"].str.split().str[0]
cars["power"] = cars["power"].astype(np.float64)


In [None]:
# A power strictly above 200 is treated as a missing value.
power_over_200 = cars["power"].gt(200)
cars.loc[power_over_200, "power"] = np.nan


In [None]:
# Convert "engine_size" to float.
# Step 1: values containing ">" or "<" lose their first character.
contains_symbol = cars["engine_size"].str.contains("[><]", na=False)
cars.loc[contains_symbol, "engine_size"] = cars.loc[
    contains_symbol, "engine_size"
].str[1:]

# Step 2: every non-missing value loses its last character.
has_engine_size = cars["engine_size"].notna()
cars.loc[has_engine_size, "engine_size"] = cars.loc[
    has_engine_size, "engine_size"
].str[:-1]

# Step 3: cast what remains to float.
cars["engine_size"] = cars["engine_size"].astype(np.float64)


In [None]:
# Parse "date_added" into datetime values (format inferred by pandas).
date_added_raw = cars["date_added"]
cars["date_added"] = pd.to_datetime(date_added_raw)

In [None]:
# Remove rows that are exact duplicates of an earlier row (first occurrence kept).
cars = cars.drop_duplicates()


In [None]:
# Write the cleaned table to disk (index included), then summarise its
# columns, dtypes and non-null counts.
cars.to_csv(path_or_buf="./new_clean_data.csv")
cars.info()
