#                                    Preprocessing of Data

### Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from category_encoders import TargetEncoder
from sklearn.preprocessing import MinMaxScaler

### Functions

In [2]:
def count_na(some_series):
    """
    Count the missing (NaN/None) values in a pandas object.
    Input :
        some_series : The series whose missing values we want to count
                      (a DataFrame is also accepted; every cell is counted)
    Output :
        s : number of missing values, as a plain Python int
    """
    # isna() yields a boolean mask; summing the underlying array counts the
    # True entries in one vectorised pass instead of a Python-level loop.
    # int(...) keeps the return type a built-in int, as the loop version did.
    s = int(some_series.isna().to_numpy().sum())
    return s

In [3]:
def correction_brands(list_of_brands):
    """
    Normalise misspelt or alternative car brand names.
    Input:
        list_of_brands : pandas object holding car brand names

    Output:
        A new object of the same kind in which every known variant has
        been replaced by its canonical brand name
    """
    # Canonical name -> the variants that should be rewritten to it.
    variants_by_brand = {
        'chevrolet': ('chevroelt', 'chevy'),
        'mazda': ('maxda',),
        'vw': ('vokswagen', 'volkswagen'),
        'toyota': ('toyouta',),
        'mercedes-benz': ('mercedes',),
    }
    # Flatten into the variant -> canonical mapping that replace() expects.
    variant_to_brand = {
        variant: brand
        for brand, variants in variants_by_brand.items()
        for variant in variants
    }
    return list_of_brands.replace(variant_to_brand)

In [4]:
def filling_na(df):
    """
    Filling nan values of a df with the Knn Imputer algorithm
    input:
        df: dataframe with only numeric values
    Output:
        re: dataframe with all na values filled, carrying the same column
            labels and the same index as the input
    """
    imputer = KNNImputer(n_neighbors=3, weights="distance")
    array = imputer.fit_transform(df)

    # Take the labels and index from the input instead of a hard-coded list,
    # so the function works for any numeric frame and the result stays
    # aligned with columns the caller set aside before imputing.
    # NOTE(review): scikit-learn documents that columns consisting only of
    # missing values may be dropped by the imputer; in that case the column
    # count no longer matches and this constructor raises ValueError.
    re = pd.DataFrame(array, columns=df.columns, index=df.index)

    return re

In [5]:
def encoding(some_series, target):
    """
    Target-encode an attribute against a target variable.
    Input:
        some_series: The attribute we want to encode
        target: The target values the encoder is fitted against
    Output:
        The value returned by TargetEncoder.fit_transform for the attribute

    NOTE(review): the encoder is created with default arguments; confirm
    against the category_encoders docs which column dtypes it transforms.
    """
    return TargetEncoder().fit_transform(some_series, target)

In [6]:
def scaler_f(some_series):
    """
    Rescale an attribute with a freshly fitted min-max scaler.
    Input:
        some_series: The attribute we want to scale
    Output:
        The value returned by MinMaxScaler.fit_transform for the attribute
    """
    return MinMaxScaler().fit_transform(some_series)

### Main

In [7]:
# Load the raw spreadsheet into a DataFrame

file_path = '/home/mav24/Documents/Development/Regeneration/Project/Data/mpg.data.xlsx'
data = pd.read_excel(io=file_path)

In [8]:
# Remove the four 'Unnamed' columns, then give the remaining columns short names

data.drop(
    labels=['Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12'],
    axis='columns',
    inplace=True,
)
data.columns = [
    'mpg',
    'cylinders',
    'displacements',
    'horsepower',
    'weight',
    'acceleration',
    'model year',
    'origin',
    'car name',
]

In [9]:
# Fill NaN values with the KNN Imputer; 'car name' is set aside first and
# only the remaining columns are passed to filling_na

car_name = data.loc[:, 'car name']
data = filling_na(data.drop('car name', axis=1))

In [10]:
# Add the 'total cc' attribute (cylinders multiplied by displacements)
# and put the car names that were set aside back on the frame

data['total cc'] = data['cylinders'].mul(data['displacements'])
data['car name'] = car_name

In [11]:
# Take the first word of each car name as its brand and correct misspellings

car_brand = correction_brands(data['car name'].str.split().str[0])
data['car brand'] = car_brand

In [12]:
# Encoding categorical attributes with target on MPG
#
# TargetEncoder with its default cols=None only transforms string columns
# (see the category_encoders documentation). 'model year' and 'origin' are
# numeric here, so they are cast to str first; otherwise the encoder would
# hand them back unchanged.

# Car brand
encoded_car_brand = encoding(data[['car brand']], data[['mpg']])

# Model year
encoded_model_year = encoding(data[['model year']].astype(str), data[['mpg']])


# Origin
encoded_origin = encoding(data[['origin']].astype(str), data[['mpg']])

  elif pd.api.types.is_categorical(cols):


In [13]:
# Min-max scale each numeric attribute to [0,1], one column at a time

(
    scaled_cylinders,
    scaled_displacements,
    scaled_horsepower,
    scaled_weight,
    scaled_acceleration,
    scaled_total_ccs,
) = (
    scaler_f(data[[column]])
    for column in (
        'cylinders',
        'displacements',
        'horsepower',
        'weight',
        'acceleration',
        'total cc',
    )
)

In [14]:
# Bring the encoded attributes onto the same [0,1] range

scaled_encoded_car_brand, scaled_encoded_model_year, scaled_encoded_origin = map(
    scaler_f,
    (encoded_car_brand, encoded_model_year, encoded_origin),
)

In [15]:
# Write the scaled values back into the dataframe (overwriting the raw columns
# and adding the encoded ones), then drop the attributes we will not use

for _column, _values in (
    ('cylinders', scaled_cylinders),
    ('displacements', scaled_displacements),
    ('horsepower', scaled_horsepower),
    ('weight', scaled_weight),
    ('acceleration', scaled_acceleration),
    ('total cc', scaled_total_ccs),
    ('encoded car brand', scaled_encoded_car_brand),
    ('encoded model year', scaled_encoded_model_year),
    ('encoded origin', scaled_encoded_origin),
):
    data[_column] = _values

data.drop(
    labels=['model year', 'origin', 'car name', 'car brand'],
    axis='columns',
    inplace=True,
)

In [17]:
# Show the preprocessed data frame
print(data)

      mpg  cylinders  displacements  horsepower    weight  acceleration  \
0    18.0        1.0       0.617571    0.456522  0.536150      0.238095   
1    15.0        1.0       0.728682    0.646739  0.589736      0.208333   
2    18.0        1.0       0.645995    0.565217  0.516870      0.178571   
3    16.0        1.0       0.609819    0.565217  0.516019      0.238095   
4    17.0        1.0       0.604651    0.510870  0.520556      0.148810   
..    ...        ...            ...         ...       ...           ...   
401  27.0        0.2       0.186047    0.217391  0.333711      0.452381   
402  44.0        0.2       0.074935    0.032609  0.146583      0.988095   
403  32.0        0.2       0.173127    0.206522  0.193365      0.214286   
404  28.0        0.2       0.134367    0.179348  0.286929      0.630952   
405  31.0        0.2       0.131783    0.195652  0.313864      0.678571   

     total cc  encoded car brand  encoded model year  encoded origin  
0    0.654810           0.16