# Task 1: Importing Libraries and Data Set

## Importing Libraries

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix, accuracy_score 
from sklearn.metrics import f1_score, precision_score, recall_score, fbeta_score
from sklearn.model_selection import cross_val_score

## Loading the Data

In [2]:
headers = ["symboling", "normalized_losses", "make", "fuel_type", "aspiration",
           "num_doors", "body_style", "drive_wheels", "engine_location",
           "wheel_base", "length", "width", "height", "curb_weight",
           "engine_type", "num_cylinders", "engine_size", "fuel_system",
           "bore", "stroke", "compression_ratio", "horsepower", "peak_rpm",
           "city_mpg", "highway_mpg", "price"]

# Read in the CSV file and convert "?" to NaN
df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data",
                  header=None, names=headers, na_values="?" )
df.head()

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


# Task 2:	Perform data cleaning and handle missing values, duplicates, and outliers

## Checking for missing values


In [3]:
df.isna().any()

symboling            False
normalized_losses     True
make                 False
fuel_type            False
aspiration           False
num_doors             True
body_style           False
drive_wheels         False
engine_location      False
wheel_base           False
length               False
width                False
height               False
curb_weight          False
engine_type          False
num_cylinders        False
engine_size          False
fuel_system          False
bore                  True
stroke                True
compression_ratio    False
horsepower            True
peak_rpm              True
city_mpg             False
highway_mpg          False
price                 True
dtype: bool

## Handling missing values

In [4]:
na_cols = df.isna().any()
na_cols = na_cols[na_cols == True].reset_index()
na_cols = na_cols["index"].tolist()
for col in df.columns[1:]:
     if col in na_cols:
        if df[col].dtype != 'object':
             df[col] =  df[col].fillna(df[col].mean()).round(0)

In [5]:
df.isna().any()

symboling            False
normalized_losses    False
make                 False
fuel_type            False
aspiration           False
num_doors             True
body_style           False
drive_wheels         False
engine_location      False
wheel_base           False
length               False
width                False
height               False
curb_weight          False
engine_type          False
num_cylinders        False
engine_size          False
fuel_system          False
bore                 False
stroke               False
compression_ratio    False
horsepower           False
peak_rpm             False
city_mpg             False
highway_mpg          False
price                False
dtype: bool

num_doors still has missing values

In [6]:
df["num_doors"].value_counts()

four    114
two      89
Name: num_doors, dtype: int64

In [7]:
df = df.fillna({"num_doors": "four"}) #filling missing values with four because it is most common value

In [8]:
df.isna().any()

symboling            False
normalized_losses    False
make                 False
fuel_type            False
aspiration           False
num_doors            False
body_style           False
drive_wheels         False
engine_location      False
wheel_base           False
length               False
width                False
height               False
curb_weight          False
engine_type          False
num_cylinders        False
engine_size          False
fuel_system          False
bore                 False
stroke               False
compression_ratio    False
horsepower           False
peak_rpm             False
city_mpg             False
highway_mpg          False
price                False
dtype: bool

## Detecting Outliers in data

In [9]:
for cols in df.columns:
  if df[col].dtype != 'object':
    print('Unique values in ',cols)
    print(df[col].unique())

Unique values in  symboling
[13495. 16500. 13950. 17450. 15250. 17710. 18920. 23875. 13207. 16430.
 16925. 20970. 21105. 24565. 30760. 41315. 36880.  5151.  6295.  6575.
  5572.  6377.  7957.  6229.  6692.  7609.  8558.  8921. 12964.  6479.
  6855.  5399.  6529.  7129.  7295.  7895.  9095.  8845. 10295. 12945.
 10345.  6785. 11048. 32250. 35550. 36000.  5195.  6095.  6795.  6695.
  7395. 10945. 11845. 13645. 15645.  8495. 10595. 10245. 10795. 11245.
 18280. 18344. 25552. 28248. 28176. 31600. 34184. 35056. 40960. 45400.
 16503.  5389.  6189.  6669.  7689.  9959.  8499. 12629. 14869. 14489.
  6989.  8189.  9279.  5499.  7099.  6649.  6849.  7349.  7299.  7799.
  7499.  7999.  8249.  8949.  9549. 13499. 14399. 17199. 19699. 18399.
 11900. 13200. 12440. 13860. 15580. 16900. 16695. 17075. 16630. 17950.
 18150. 12764. 22018. 32528. 34028. 37028.  9295.  9895. 11850. 12170.
 15040. 15510. 18620.  5118.  7053.  7603.  7126.  7775.  9960.  9233.
 11259.  7463. 10198.  8013. 11694.  5348.  6338.

## Performing Data Encoding

### Manually encoding num_doors and num_cylinders

In [10]:
cleanup_nums = {"num_doors":     {"four": 4, "two": 2},
                "num_cylinders": {"four": 4, "six": 6, "five": 5, "eight": 8,
                                  "two": 2, "twelve": 12, "three":3 }}
df = df.replace(cleanup_nums)
df.head()                                  

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,122.0,alfa-romero,gas,std,2,convertible,rwd,front,88.6,...,130,mpfi,3.0,3.0,9.0,111.0,5000.0,21,27,13495.0
1,3,122.0,alfa-romero,gas,std,2,convertible,rwd,front,88.6,...,130,mpfi,3.0,3.0,9.0,111.0,5000.0,21,27,16500.0
2,1,122.0,alfa-romero,gas,std,2,hatchback,rwd,front,94.5,...,152,mpfi,3.0,3.0,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,4,sedan,fwd,front,99.8,...,109,mpfi,3.0,3.0,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,4,sedan,4wd,front,99.4,...,136,mpfi,3.0,3.0,8.0,115.0,5500.0,18,22,17450.0


### Label Encoding Binary Data

In [11]:
#Create a label encoder object
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# Label Encoding will be used for columns with 2 or less unique values
le_count = 0
for col in df.columns[1:]:
    if df[col].dtype == 'object':
        if len(list(df[col].unique())) <= 2:
            le.fit(df[col])
            df[col] = le.transform(df[col])
            le_count += 1
print('{} columns were label encoded.'.format(le_count)) 

3 columns were label encoded.


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   normalized_losses  205 non-null    float64
 2   make               205 non-null    object 
 3   fuel_type          205 non-null    int64  
 4   aspiration         205 non-null    int64  
 5   num_doors          205 non-null    int64  
 6   body_style         205 non-null    object 
 7   drive_wheels       205 non-null    object 
 8   engine_location    205 non-null    int64  
 9   wheel_base         205 non-null    float64
 10  length             205 non-null    float64
 11  width              205 non-null    float64
 12  height             205 non-null    float64
 13  curb_weight        205 non-null    int64  
 14  engine_type        205 non-null    object 
 15  num_cylinders      205 non-null    int64  
 16  engine_size        205 non

In [13]:
pip install category-encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [14]:
# import category encoders

import category_encoders as ce
# encode variables with ordinal encoding

encoder = ce.OrdinalEncoder(cols=["body_style", "drive_wheels","make","engine_type","fuel_system"])
df = encoder.fit_transform(df)
df.sample(10)


Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
15,0,122.0,3,1,0,4,3,1,0,103.5,...,209,1,4.0,3.0,8.0,182.0,5400.0,16,22,30760.0
196,-2,103.0,22,1,0,4,3,1,0,104.3,...,141,1,4.0,3.0,9.5,114.0,5400.0,24,28,15985.0
19,1,98.0,4,1,0,2,2,2,0,94.5,...,90,2,3.0,3.0,9.6,70.0,5400.0,38,43,6295.0
2,1,122.0,1,1,0,2,2,1,0,94.5,...,152,1,3.0,3.0,9.0,154.0,5000.0,19,26,16500.0
168,2,134.0,20,1,0,2,5,1,0,98.4,...,146,1,4.0,4.0,9.3,116.0,4800.0,24,30,9639.0
120,1,154.0,15,1,0,4,2,2,0,93.7,...,90,2,3.0,3.0,9.4,68.0,5500.0,31,38,6229.0
182,2,122.0,21,0,0,2,3,2,0,97.3,...,97,7,3.0,3.0,23.0,52.0,4800.0,37,46,7775.0
152,1,74.0,20,1,0,4,2,2,0,95.7,...,92,2,3.0,3.0,9.0,62.0,4800.0,31,38,6488.0
126,3,122.0,16,1,0,2,5,1,1,89.5,...,194,1,4.0,3.0,9.5,207.0,5900.0,17,25,32528.0
89,1,128.0,13,1,0,2,3,2,0,94.5,...,97,2,3.0,3.0,9.4,69.0,5200.0,31,37,5499.0


In [15]:
df.info()
non_feature = df.copy()
non_feature.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   normalized_losses  205 non-null    float64
 2   make               205 non-null    int64  
 3   fuel_type          205 non-null    int64  
 4   aspiration         205 non-null    int64  
 5   num_doors          205 non-null    int64  
 6   body_style         205 non-null    int64  
 7   drive_wheels       205 non-null    int64  
 8   engine_location    205 non-null    int64  
 9   wheel_base         205 non-null    float64
 10  length             205 non-null    float64
 11  width              205 non-null    float64
 12  height             205 non-null    float64
 13  curb_weight        205 non-null    int64  
 14  engine_type        205 non-null    int64  
 15  num_cylinders      205 non-null    int64  
 16  engine_size        205 non

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,122.0,1,1,0,2,1,1,0,88.6,...,130,1,3.0,3.0,9.0,111.0,5000.0,21,27,13495.0
1,3,122.0,1,1,0,2,1,1,0,88.6,...,130,1,3.0,3.0,9.0,111.0,5000.0,21,27,16500.0
2,1,122.0,1,1,0,2,2,1,0,94.5,...,152,1,3.0,3.0,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,2,1,0,4,3,2,0,99.8,...,109,1,3.0,3.0,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,2,1,0,4,3,3,0,99.4,...,136,1,3.0,3.0,8.0,115.0,5500.0,18,22,17450.0


# Task 3 and 5: Feature Extraction

## Feature Extraction based on correlation map

In [16]:
corr_matrix = df.corr()
corr_matrix.style.background_gradient(cmap='coolwarm')

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,length,width,height,curb_weight,engine_type,num_cylinders,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
symboling,1.0,0.46519,-0.118794,0.194311,-0.059866,-0.663595,-0.39897,0.041671,0.212471,-0.531954,-0.357612,-0.232919,-0.541038,-0.227691,0.015797,-0.113129,-0.10579,0.016678,-0.06936,-0.075427,-0.178515,0.07138,0.273678,-0.035823,0.034606,-0.082201
normalized_losses,0.46519,1.0,-0.250975,0.101437,-0.006823,-0.357192,-0.212492,-0.311032,-0.0,-0.056518,0.019209,0.084195,-0.370706,0.097785,-0.185247,0.108081,0.110997,-0.080902,-0.101499,-0.111297,-0.114525,0.203434,0.237748,-0.218749,-0.178221,0.133999
make,-0.118794,-0.250975,1.0,-0.113191,0.054265,0.151516,0.154998,0.004317,0.054608,0.078505,0.119584,0.003783,0.236233,0.024015,0.147577,-0.125921,-0.070918,-0.045111,0.233315,-0.024856,0.138828,-0.053692,-0.218347,0.053642,0.050022,-0.161471
fuel_type,0.194311,0.101437,-0.113191,1.0,-0.401397,-0.188496,-0.15758,0.132257,0.04007,-0.308346,-0.212679,-0.23388,-0.284631,-0.217275,-0.028837,0.024544,-0.069594,-0.682404,-0.093924,-0.280307,-0.984356,0.165169,0.477058,-0.255963,-0.191392,-0.110207
aspiration,-0.059866,-0.006823,0.054265,-0.401397,1.0,0.052803,0.021276,-0.066465,-0.057191,0.257611,0.234539,0.300567,0.087311,0.324902,0.014505,-0.047972,0.108217,0.473355,0.334397,0.260573,0.295541,0.240212,-0.183626,-0.202362,-0.254416,0.177285
num_doors,-0.663595,-0.357192,0.151516,-0.188496,0.052803,1.0,0.450456,0.104877,-0.139129,0.439635,0.385675,0.197735,0.540286,0.19072,0.035782,-0.019503,0.013919,-0.024635,0.025793,-0.013026,0.171797,-0.128176,-0.240297,-0.014271,-0.037452,0.041945
body_style,-0.39897,-0.212492,0.154998,-0.15758,0.021276,0.450456,1.0,-0.056886,0.126546,0.368772,0.376842,0.191511,0.483675,0.240367,0.127736,0.106245,0.172381,-0.054724,0.226081,0.074287,0.156686,0.048094,-0.14691,-0.095826,-0.126939,0.178642
drive_wheels,0.041671,-0.311032,0.004317,0.132257,-0.066465,0.104877,-0.056886,1.0,-0.147865,-0.459745,-0.485649,-0.470751,0.019719,-0.575111,0.192637,-0.314006,-0.524307,0.036437,-0.335491,-0.147605,-0.127479,-0.516973,0.039719,0.449581,0.45222,-0.576867
engine_location,0.212471,-0.0,0.054608,0.04007,-0.057191,-0.139129,0.126546,-0.147865,1.0,-0.18779,-0.050989,-0.051698,-0.106234,0.050468,0.323697,0.183048,0.196826,-0.082672,0.178851,-0.045045,-0.019762,0.317618,0.198401,-0.153487,-0.102026,0.331013
wheel_base,-0.531954,-0.056518,0.078505,-0.308346,0.257611,0.439635,0.368772,-0.459745,-0.18779,1.0,0.874587,0.795144,0.589435,0.776386,-0.136452,0.339507,0.569329,0.004828,0.373836,0.197786,0.249786,0.351985,-0.3607,-0.470414,-0.544082,0.583168


## Analysis of the above correlation map



*   highway_mpg and city_mpg are heavily correlated so we will drop the city_mpg column beacuse highway_mpg has higher correlation with price.
*   wheel_base and length are heavily correlated so we will drop wheel_base column because length has higher correlation with price.
*  num_doors, symboling and compression_ratio has very low correlation with price. We will drop these columns as well.
* curb_weight also has high correlation with length, width and engine_size. So we will drop this column as well.



In [17]:
df = df.drop('highway_mpg',axis=1)
df = df.drop(['wheel_base','num_doors','symboling','compression_ratio','curb_weight'], axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   normalized_losses  205 non-null    float64
 1   make               205 non-null    int64  
 2   fuel_type          205 non-null    int64  
 3   aspiration         205 non-null    int64  
 4   body_style         205 non-null    int64  
 5   drive_wheels       205 non-null    int64  
 6   engine_location    205 non-null    int64  
 7   length             205 non-null    float64
 8   width              205 non-null    float64
 9   height             205 non-null    float64
 10  engine_type        205 non-null    int64  
 11  num_cylinders      205 non-null    int64  
 12  engine_size        205 non-null    int64  
 13  fuel_system        205 non-null    int64  
 14  bore               205 non-null    float64
 15  stroke             205 non-null    float64
 16  horsepower         205 non

# Task 4: Perform feature scaling and transform the data

## Splitting the dataset into test and train

In [18]:
#Identify response variable:
response = df["price"]
df = df.drop(columns="price")
df.isnull().values.any()

False

In [19]:
response1 = non_feature['price']
non_feature = non_feature.drop(columns="price")
non_feature.isnull().values.any()

False

## Performing Feature Scaling

In [20]:
scaler = StandardScaler()
scaler.fit(df)

Scaling dataset with no feature extraction

In [21]:
scaler1 = StandardScaler()
scaler1.fit(non_feature)
X_train1, X_test1, y_train1, y_test1 = train_test_split(non_feature, response1, test_size = 0.2, random_state = 0)

In [22]:
X_train, X_test, y_train, y_test = train_test_split(df, response, test_size = 0.2, random_state = 0)
#to resolve any class imbalance - use stratify parameter.
print("Number transactions X_train dataset: ", X_train.shape)
print("Number transactions y_train dataset: ", y_train.shape)
print("Number transactions X_test dataset: ", X_test.shape)
print("Number transactions y_test dataset: ", y_test.shape)

Number transactions X_train dataset:  (164, 19)
Number transactions y_train dataset:  (164,)
Number transactions X_test dataset:  (41, 19)
Number transactions y_test dataset:  (41,)


# Task 6: Evaluate performance of machine learning model

## Building Logistic Regression Model

In [23]:
# Fitting Logistic Regression to the Training set
classifier = LinearRegression()
classifier.fit(X_train,y_train)

Fitting the linear regression model on data with no feature extraction

In [24]:
classifier1 = LinearRegression()
classifier1.fit(X_train1,y_train1)

## Evaluating Performance

In [25]:
#Evaluate results
classifier.score(X_test,y_test)

0.6968500805853715

In [26]:
classifier1.score(X_test1,y_test1)

0.6679123239494096