In [1]:
import pandas as pd  # data-frame library used throughout the notebook

# Load the vehicle listings from the CSV file in the working directory.
data_frame = pd.read_csv('vehicles.csv')

# Overview of the columns: names, dtypes and non-null counts.
print('\t\tData info\n')
data_frame.info()

# Summary statistics for every column (numeric and non-numeric);
# left as the last expression so the notebook renders the table.
print('=====================================================\n\t\tData description\n')
summary = data_frame.describe(include='all')
summary

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 426880 entries, 0 to 426879
Data columns (total 17 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   price         426880 non-null  int64  
 1   year          425675 non-null  float64
 2   manufacturer  409234 non-null  object 
 3   model         421603 non-null  object 
 4   condition     252776 non-null  object 
 5   cylinders     249202 non-null  object 
 6   fuel          423867 non-null  object 
 7   odometer      422480 non-null  float64
 8   title_status  418638 non-null  object 
 9   transmission  424324 non-null  object 
 10  drive         296313 non-null  object 
 11  size          120519 non-null  object 
 12  type          334022 non-null  object 
 13  paint_color   296677 non-null  object 
 14  state         426880 non-null  object 
 15  lat           420331 non-null  float64
 16  long          420331 non-null  float64
dtypes: float64(4), int64(1), object(12)
memory usage

In [2]:
# Drop the columns that are not needed.
# One drop() call replaces the original per-column loop, which rebuilt the
# whole frame once for every column. errors='ignore' lets this cell be re-run
# after the columns are already gone instead of raising KeyError.
columns_to_drop = ['id', 'url', 'region_url', 'county', 'image_url', 'description', 'posting_date', 'VIN', 'region']
data_frame = data_frame.drop(columns=columns_to_drop, errors='ignore')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 426880 entries, 0 to 426879
Data columns (total 17 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   price         426880 non-null  float64
 1   year          426880 non-null  float64
 2   manufacturer  426880 non-null  object 
 3   model         426880 non-null  object 
 4   condition     426880 non-null  object 
 5   cylinders     426880 non-null  object 
 6   fuel          426880 non-null  object 
 7   odometer      426880 non-null  float64
 8   title_status  426880 non-null  object 
 9   transmission  426880 non-null  object 
 10  drive         426880 non-null  object 
 11  size          426880 non-null  object 
 12  type          426880 non-null  object 
 13  paint_color   426880 non-null  object 
 14  state         426880 non-null  object 
 15  lat           426880 non-null  float64
 16  long          426880 non-null  float64
dtypes: float64(5), object(12)
memory usage: 55.4+ MB

In [None]:
import numpy as np  # array / numeric helpers

numerical_columns = ['odometer', 'lat', 'long', 'price', 'year']
categorical_columns = ['manufacturer', 'condition', 'cylinders', 'fuel', 'title_status', 'transmission', 'drive', 'size', 'type', 'paint_color', 'model', 'state']

# Columns in which a negative value can only be a data error.
# 'lat' / 'long' are deliberately excluded: they are presumably geographic
# coordinates, for which negative values are legitimate, and the original
# blanket "< 0 -> NaN" rule erased them.
non_negative_columns = ['odometer', 'price', 'year']

# Work in float so NaN and the fractional cap can be stored without an
# implicit dtype change ('price' is read from the CSV as int64).
data_frame[numerical_columns] = data_frame[numerical_columns].astype(float)

# A year with a fractional part is treated as invalid and marked as missing.
data_frame.loc[data_frame['year'] % 1 != 0, 'year'] = np.nan

for column_name in numerical_columns:
    column = data_frame[column_name]
    if column_name in non_negative_columns:
        column = column.mask(column < 0)  # negative values become NaN
    threshold = column.std() * 2  # allowed margin above the mean
    max_value = column.mean() + threshold  # largest value accepted for this column
    column = column.mask(column > max_value, max_value)  # cap larger values at the maximum
    # Fill missing values with the (post-capping) mean. Assigning the result
    # back replaces `data_frame[col].fillna(..., inplace=True)`, which is
    # chained assignment and does not update the frame under Copy-on-Write.
    data_frame[column_name] = column.fillna(column.mean())

for column_name in categorical_columns:
    # Fill missing categorical values with the most frequent value (mode).
    data_frame[column_name] = data_frame[column_name].fillna(data_frame[column_name].mode()[0])

data_frame.info()

In [None]:
from sklearn.preprocessing import LabelEncoder  # turns categorical values into integer codes

# One fitted encoder per categorical column, keyed by column name, so the
# original labels stay available through `classes_`.
label_encoders = {}
for column_name in categorical_columns:
    encoder = label_encoders[column_name] = LabelEncoder()
    # Overwrite the column in the current frame with its integer codes.
    data_frame[column_name] = encoder.fit_transform(data_frame[column_name])

In [None]:
# Separate the target column ('type') from the remaining feature columns.
y = data_frame['type']
X = data_frame.drop(columns='type')

In [None]:
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(6, 6))

# value_counts() orders classes by frequency, whereas classes_ is ordered by
# encoded value. Sorting the counts by encoded class and indexing the labels
# with those codes keeps every wedge paired with its own label.
cv = y.value_counts().sort_index()

original_labels = label_encoders['type'].classes_  # original (un-encoded) values of the 'type' column

ax.pie(x=cv, labels=original_labels[cv.index.to_numpy()], autopct='%1.1f%%')
ax.set_title('Original classes distribution', fontsize=14)
plt.draw()

In [None]:
from imblearn.over_sampling import SMOTE

# Fixed seed so the synthetic samples are the same on every run.
# NOTE(review): oversampling runs before the later train/test split, so
# synthetic rows derived from test rows can end up in the training set —
# consider resampling the training portion only.
oversample = SMOTE(random_state=42)

X, y = oversample.fit_resample(X, y)

# Sort by encoded class and pick labels by code so wedges and labels match
# (value_counts() alone orders by frequency, not by class code).
cv = y.value_counts().sort_index()

fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(x=cv, labels=original_labels[cv.index.to_numpy()], autopct='%1.1f%%')
ax.set_title('New classes distribution', fontsize=14)

In [None]:
# Rebuild a single frame from the current features and target.
data_frame = pd.concat((X, y), axis='columns')

print('\t\tData info\n')
data_frame.info()

print('=====================================================\n\t\tData description\n')
description = data_frame.describe()
print(description)

# Pearson correlation of every feature column with the encoded target.
relation = X.corrwith(other=y, method='pearson')
print('=====================================================\n\t\tPearson relathion\n')
print(relation)

In [None]:
# Print how many distinct values each categorical and numerical column holds.
for column_name in categorical_columns + numerical_columns:
    distinct_values = data_frame[column_name].unique()
    print(column_name, len(distinct_values))

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every numerical feature. 'long' was missing from this list
# although 'lat' was scaled, which left one numerical feature on a different
# scale from the rest.
# NOTE(review): the scaler is fitted on the full data set before the
# train/test split, so test-set statistics influence the training features.
columns_to_scale = ['odometer', 'lat', 'long', 'price', 'year']
scaler = StandardScaler()
data_frame[columns_to_scale] = scaler.fit_transform(data_frame[columns_to_scale])

# Re-derive features and target from the scaled frame.
X = data_frame.drop('type', axis=1)
y = data_frame['type']

In [None]:
# Split the data into training and testing sets.
from sklearn.model_selection import train_test_split

# Stratified 80/20 split. The fixed random_state keeps the split — and every
# score computed from it — identical from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

In [None]:
# Logistic regression classifier
from sklearn.linear_model import LogisticRegression

# The 'sag' solver shuffles the data while optimising, so it is seeded to
# make the fitted model reproducible.
# NOTE(review): 'sag' is only guaranteed to converge quickly on features of
# similar scale — watch for ConvergenceWarning and raise max_iter if needed.
logisticRegressionModel = LogisticRegression(solver='sag', random_state=42)

# Train on the training split.
logisticRegressionModel.fit(X_train, y_train)

# Predict the classes of the held-out test split.
logisticRegression_y_pred = logisticRegressionModel.predict(X_test)

In [None]:
# Evaluate the logistic-regression predictions against the test labels.
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Macro averaging is used for precision / recall / F1: with micro averaging on
# a single-label multiclass problem all three equal the accuracy, so the table
# would show the same number four times. Macro is also what the Naive Bayes
# evaluation in this notebook uses.
logisticRegression_scores = {
    'Accuracy': accuracy_score(y_test, logisticRegression_y_pred),
    'Precision': precision_score(y_test, logisticRegression_y_pred, average='macro'),
    'Recall': recall_score(y_test, logisticRegression_y_pred, average='macro'),
    'F1': f1_score(y_test, logisticRegression_y_pred, average='macro')
}

# Turn each score into a percentage string with two decimals.
logisticRegression_scores = {
    key: '{:.2f}'.format(100 * value)
    for key, value in logisticRegression_scores.items()
}

# Two-row table: metric names on top, formatted values underneath.
printString = "\t\tLogistic Regression scores\n"
printString += "".join("{:^15} |".format(key) for key in logisticRegression_scores)
printString += "\n-------------------------------------------------------------------------------------\n"
printString += "".join("{:^15} |".format(value) for value in logisticRegression_scores.values())
print(printString)

In [None]:
# Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier

# Seeded so the fitted tree, and therefore its scores, are reproducible
# (the estimator randomises feature order / tie-breaking during splitting).
decisionTreeModel = DecisionTreeClassifier(random_state=42)

# Train on the training split.
decisionTreeModel.fit(X_train, y_train)

# Predict the classes of the held-out test split.
decisionTree_y_pred = decisionTreeModel.predict(X_test)

In [None]:
# Evaluate the decision-tree predictions against the test labels.
# Macro averaging is used for precision / recall / F1: with micro averaging on
# a single-label multiclass problem all three equal the accuracy, so the table
# would show the same number four times. Macro is also what the Naive Bayes
# evaluation in this notebook uses.
decisionTree_scores = {
    'Accuracy': accuracy_score(y_test, decisionTree_y_pred),
    'Precision': precision_score(y_test, decisionTree_y_pred, average='macro'),
    'Recall': recall_score(y_test, decisionTree_y_pred, average='macro'),
    'F1': f1_score(y_test, decisionTree_y_pred, average='macro')
}

# Turn each score into a percentage string with two decimals.
decisionTree_scores = {
    key: '{:.2f}'.format(100 * value)
    for key, value in decisionTree_scores.items()
}

# Two-row table: metric names on top, formatted values underneath.
printString = "\t\tDecision Tree scores\n"
printString += "".join("{:^15} |".format(key) for key in decisionTree_scores)
printString += "\n-------------------------------------------------------------------------------------\n"
printString += "".join("{:^15} |".format(value) for value in decisionTree_scores.values())
print(printString)

In [None]:
# Bayes classifier
from sklearn.naive_bayes import CategoricalNB, GaussianNB

# Split the features by kind: the encoded categorical columns (without the
# target 'type') feed CategoricalNB, the numerical columns feed GaussianNB.
categorical_feature_columns = [column_name for column_name in categorical_columns if column_name != 'type']
X_train_numerical = X_train[numerical_columns]
X_train_categorical = X_train[categorical_feature_columns]
X_test_numerical = X_test[numerical_columns]
X_test_categorical = X_test[categorical_feature_columns]

# alpha=1.0 (Laplace smoothing) replaces alpha=0: without smoothing, any
# category never seen together with a class gets probability zero.
# min_categories is taken from the label encoders so that a category code
# present only in the test split is still a valid index at predict time.
categorical_model = CategoricalNB(
    alpha=1.0,
    min_categories=[len(label_encoders[column_name].classes_) for column_name in categorical_feature_columns],
)
numerical_model = GaussianNB()

# Train both models on the training split.
categorical_model.fit(X_train_categorical, y_train)
numerical_model.fit(X_train_numerical, y_train)

# Per-model predictions on the test split.
categorical_pred = categorical_model.predict(X_test_categorical)
numerical_pred = numerical_model.predict(X_test_numerical)

# The two prediction vectors side by side (one row per test sample).
combined_preds = np.vstack((categorical_pred, numerical_pred)).T

# Combine the two models the naive-Bayes way instead of "voting" between two
# voters (where every disagreement was resolved to the lowest class index):
#   log P(y | cat, num) = log P(y | cat) + log P(y | num) - log P(y) + const
# Both models were fitted on the same y_train, so their class order matches.
joint_log_proba = (
    categorical_model.predict_log_proba(X_test_categorical)
    + numerical_model.predict_log_proba(X_test_numerical)
    - categorical_model.class_log_prior_
)
bayes_y_pred = final_preds = categorical_model.classes_[np.argmax(joint_log_proba, axis=1)]

In [None]:
# Evaluate the three Naive Bayes prediction vectors against the test labels.
def _macro_scores(predictions):
    """Return accuracy and macro-averaged precision, recall and F1 of `predictions` versus y_test."""
    return {
        'Accuracy': accuracy_score(y_test, predictions),
        'Precision': precision_score(y_test, predictions, average='macro'),
        'Recall': recall_score(y_test, predictions, average='macro'),
        'F1': f1_score(y_test, predictions, average='macro')
    }


categorical_scores = _macro_scores(categorical_pred)
numerical_scores = _macro_scores(numerical_pred)
bayes_scores = _macro_scores(bayes_y_pred)

# Print one two-row table per score set: combined model first, then the
# categorical-only and numerical-only models.
score_tables = (
    ("\t\tNaive Bayes scores\n", bayes_scores),
    ("\n\t\tCategorical Naive Bayes scores\n", categorical_scores),
    ("\n\t\tNumerical Naive Bayes scores\n", numerical_scores),
)
for table_title, table_scores in score_tables:
    # Replace each raw score by its percentage string (two decimals).
    for key in table_scores:
        table_scores[key] = '{:.2f}'.format(100 * table_scores[key])

    printString = table_title
    printString += "".join("{:^15} |".format(key) for key in table_scores)
    printString += "\n-------------------------------------------------------------------------------------\n"
    printString += "".join("{:^15} |".format(table_scores[key]) for key in table_scores)
    print(printString)

In [None]:
import matplotlib.pyplot as plt

# Draw one 50-bin histogram per numerical column, each in its own figure.
for column_name in numerical_columns:
    hist_fig, hist_ax = plt.subplots()
    hist_ax.hist(data_frame[column_name], bins=50)
    hist_ax.set_xlabel(column_name)
    hist_ax.set_ylabel('Frequency')
    hist_ax.set_title(f'Distribution of {column_name}')
    plt.show()