In [30]:
import pandas as pd
import numpy as np

import matplotlib.pyplot  #for visualization
import seaborn  #for plotting boxplot
import sklearn  # contains all model

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split # for testing and splitting dataset


# STEP 2:LOADING THE DATATSET

In [None]:
#TODO:GET THE DATASET. /AI_Invasion_In-Class_Dataset.xlsx from your AI INVASION
#STUDY PACK
#NOTE : you can use panda read_excel to read file with xlsx format

In [32]:
df=pd.read_excel('AI_Invasion_In-Class_Dataset.xlsx')
df.head()

Unnamed: 0,Location,Maker,Model,Year,Colour,Amount (Million ₦),Type,Distance_Km
0,Abuja,Mercedes-Benz,GLA 250,2015.0,Brown,14.5,Foreign Used,50000.0
1,Abuja,Hyundai,Accent,2013.0,Red,1.55,Nigerian Used,
2,Lagos,Lexus,GX 460 Premium,2011.0,White,14.0,Foreign Used,85000.0
3,Lagos,Lexus,ES 350,2011.0,Gray,4.95,Foreign Used,
4,Ibadan,Toyota,Verso 1.6,2009.0,Silver,1.69,Nigerian Used,118906.0


In [None]:
#location, maker, Model, year, model, year, colour are the features i.e variables 
#while the distance_km is the target we want to predict
#supervised learning gives the expected features to use for predicting e.g score, price

In [33]:
df.info()  #syntax for getting the details of your data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4487 entries, 0 to 4486
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Location            4487 non-null   object 
 1   Maker               4487 non-null   object 
 2   Model               4487 non-null   object 
 3   Year                4487 non-null   float64
 4   Colour              4487 non-null   object 
 5   Amount (Million ₦)  4487 non-null   float64
 6   Type                4487 non-null   object 
 7   Distance_Km         2932 non-null   float64
dtypes: float64(3), object(5)
memory usage: 280.6+ KB


In [34]:
df.describe()

Unnamed: 0,Year,Amount (Million ₦),Distance_Km
count,4487.0,4487.0,2932.0
mean,2011.09561,11.309795,101038.3
std,4.823362,20.585915,115091.4
min,1982.0,0.42,1.0
25%,2008.0,3.6,52378.5
50%,2011.0,5.7,79000.0
75%,2014.0,12.0,109939.2
max,2022.0,454.0,1785448.0


# CLEAN THE DATATSET

In [35]:
df.columns  #df means dataset

Index(['Location', 'Maker', 'Model', 'Year', 'Colour', 'Amount (Million ₦)',
       'Type', 'Distance_Km'],
      dtype='object')

In [36]:
#check for missing value
df.isnull().sum()

Location                 0
Maker                    0
Model                    0
Year                     0
Colour                   0
Amount (Million ₦)       0
Type                     0
Distance_Km           1555
dtype: int64

In [37]:
#fill up missing values in Distance_km will the mean
mean_value=df['Distance_Km'].mean()
print(mean_value)

df['Distance_Km'].fillna(mean_value, inplace=True)   #the inplace=true is to make the change permanent

101038.32128240108


In [38]:
#Check and make sure all missing value have been filled
df.isnull().sum()

Location              0
Maker                 0
Model                 0
Year                  0
Colour                0
Amount (Million ₦)    0
Type                  0
Distance_Km           0
dtype: int64

In [39]:
#the main of this section is to rename the different
#class is our categorical feature that were not properly named
#or change the data type of a column

cat_features={'Location', 'Model', 'Maker', 'Year', 'Colour', 'Type'}

for cat_feature in cat_features:
    print(cat_feature, df[cat_feature].unique(), sep=':')
    print('#'*50)

Maker:['Mercedes-Benz' 'Hyundai' 'Lexus' 'Toyota' 'Mazda' 'Honda' 'Land Rover'
 'Porsche' 'Acura' 'Nissan' 'Pontiac' 'Ford' 'Jeep' 'Kia' 'Peugeot' 'BMW'
 'Mitsubishi' 'Dodge' 'Chevrolet' 'Scion' 'Audi' 'Infiniti' 'Mini'
 'Volkswagen' 'Suzuki' 'Chrysler' 'Volvo' 'Rolls-Royce' 'JAC' 'Subaru'
 'Renault' 'GMC' 'Rover' 'IVM' 'Bentley' 'Opel' 'Lincoln' 'Hummer'
 'Saturn' 'Cadillac' 'Lamborghini' 'Buick' 'Smart' 'Jaguar' 'Ferrari'
 'Tata' 'Skoda']
##################################################
Type:['Foreign Used' 'Nigerian Used' 'Brand New']
##################################################
Year:[2015. 2013. 2011. 2009. 2008. 2010. 2014. 2012. 2022. 2006. 2021. 2017.
 2007. 2002. 2016. 2019. 2020. 2004. 2018. 2005. 2003. 2000. 1999. 2001.
 1989. 1998. 1982. 1994. 1993. 1997.]
##################################################
Location:['Abuja' 'Lagos' 'Ibadan']
##################################################
Colour:['Brown' 'Red' 'White' 'Gray' 'Silver' 'Black' 'Blue' 'Gold' 'Green'


In [40]:
#the model is too much and it does not affect the price of the cars, so we drop it
df.drop('Model', axis=1, inplace=True)  #the axis is either 0 or 1, 1=column
df.head()

Unnamed: 0,Location,Maker,Year,Colour,Amount (Million ₦),Type,Distance_Km
0,Abuja,Mercedes-Benz,2015.0,Brown,14.5,Foreign Used,50000.0
1,Abuja,Hyundai,2013.0,Red,1.55,Nigerian Used,101038.321282
2,Lagos,Lexus,2011.0,White,14.0,Foreign Used,85000.0
3,Lagos,Lexus,2011.0,Gray,4.95,Foreign Used,101038.321282
4,Ibadan,Toyota,2009.0,Silver,1.69,Nigerian Used,118906.0


In [41]:
#label Encoding
cat_features=['Location','Maker','Year','Colour','Type']

for cat_feature in cat_features:
    df[f'{cat_feature}_cat']=df[cat_feature].astype('category')
    df[f'{cat_feature}_cat']=df[f'{cat_feature}_cat'].cat.codes
    
    #READ MORE ON PANDAS GET_DUMMIES
    

df.head()

Unnamed: 0,Location,Maker,Year,Colour,Amount (Million ₦),Type,Distance_Km,Location_cat,Maker_cat,Year_cat,Colour_cat,Type_cat
0,Abuja,Mercedes-Benz,2015.0,Brown,14.5,Foreign Used,50000.0,0,26,22,3,1
1,Abuja,Hyundai,2013.0,Red,1.55,Nigerian Used,101038.321282,0,14,20,14,2
2,Lagos,Lexus,2011.0,White,14.0,Foreign Used,85000.0,2,23,18,17,1
3,Lagos,Lexus,2011.0,Gray,4.95,Foreign Used,101038.321282,2,23,18,6,1
4,Ibadan,Toyota,2009.0,Silver,1.69,Nigerian Used,118906.0,1,44,16,15,2


In [42]:
#drop the reductant features since label encoding have been done
df.drop(['Location','Maker','Year','Colour','Type'], axis=1, inplace=True)
df.head()

Unnamed: 0,Amount (Million ₦),Distance_Km,Location_cat,Maker_cat,Year_cat,Colour_cat,Type_cat
0,14.5,50000.0,0,26,22,3,1
1,1.55,101038.321282,0,14,20,14,2
2,14.0,85000.0,2,23,18,17,1
3,4.95,101038.321282,2,23,18,6,1
4,1.69,118906.0,1,44,16,15,2


In [None]:
#features are the X= others and label to Y= amount in Million (we are predicting Y)

In [50]:
y=df['Amount (Million ₦)'] #Target
x=df.drop('Amount (Million ₦)', axis=1)

In [52]:
y

0       14.50
1        1.55
2       14.00
3        4.95
4        1.69
        ...  
4482     4.60
4483     4.50
4484    10.45
4485    31.00
4486    14.00
Name: Amount (Million ₦), Length: 4487, dtype: float64

In [54]:
print(x)

        Distance_Km  Location_cat  Maker_cat  Year_cat  Colour_cat  Type_cat
0      50000.000000             0         26        22           3         1
1     101038.321282             0         14        20          14         2
2      85000.000000             2         23        18          17         1
3     101038.321282             2         23        18           6         1
4     118906.000000             1         44        16          15         2
...             ...           ...        ...       ...         ...       ...
4482   90282.000000             2         23        13           2         1
4483   85000.000000             2         23        14           2         1
4484   65214.000000             0         26        21           7         1
4485   45000.000000             2         23        27           1         1
4486   55000.000000             2         23        22          15         1

[4487 rows x 6 columns]


In [55]:
x_train, x_test, y_train, y_test=train_test_split(x,y,test_size=0.3)

In [56]:
x_train.shape

(3140, 6)

In [57]:
x_test.shape

(1347, 6)

In [58]:
df.shape

(4487, 7)

In [59]:
y_test.shape

(1347,)

# LOAD YOUR DATA INTO THE LINEAR REGRESSION MODEL I.E. TRAIN YOUR MODEL

In [64]:
from sklearn.linear_model import LinearRegression

reg=LinearRegression()
reg.fit(x_train, y_train)

LinearRegression()

# Make prediction

In [65]:
reg.predict(x_test)   #to predict the Y, the result shows the Expected Y predicted by the model

array([16.99494329, 21.69996191,  7.8970363 , ..., 19.64910833,
        6.67042298, -7.97442766])

# Evaluate the model

In [67]:
from sklearn.metrics import mean_absolute_error

#we are using mean_absolute_error because
#this is a regression model

y_pred= reg.predict(x_test)
print('MAE',mean_absolute_error(y_test,y_pred))

MAE 7.588067012225298


In [68]:
from sklearn.tree import DecisionTreeRegressor

dt_reg=DecisionTreeRegressor()
dt_reg.fit(x_train,y_train)
y_pred=dt_reg.predict(x_test)

print('MAE', mean_absolute_error(y_test,y_pred))

MAE 4.804740892636193


# SVM

In [None]:
#SUPPORT VECTOR MACHINE

In [71]:
from sklearn.svm import SVR

sv_reg=SVR()
sv_reg.fit(x_train,y_train)
y_pred=sv_reg.predict(x_test)
print('MAE',mean_absolute_error(y_test,y_pred))

MAE 6.68956481156402
