In [8]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style='darkgrid')


# Locations of the train/test workbooks on the mounted Google Drive.
data_train_path = r"/content/drive/My Drive/Colab Notebooks/Verzeo Internship/Major Project/datasets/Data_Train.xlsx"
data_test_path = r"/content/drive/My Drive/Colab Notebooks/Verzeo Internship/Major Project/datasets/Data_Test.xlsx"

# Read both workbooks into DataFrames, then preview the training data.
data_train, data_test = (
    pd.read_excel(workbook) for workbook in (data_train_path, data_test_path)
)
data_train.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,1.75
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,12.5
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,4.5
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,6.0
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,17.74


## The Data-Preprocessing part

In [9]:
# Rows of each frame that contain at least one missing value (kept for inspection).
df1 = data_train.loc[data_train.isna().any(axis=1)]
df2 = data_test.loc[data_test.isna().any(axis=1)]

# Some "Power" cells hold the literal text 'null' instead of a real NaN.
# na=True means rows whose "Power" is already NaN are selected as well.
data_test_null_string = data_test.loc[
    data_test['Power'].str.contains('null', na=True)
]


# Columns in which the textual 'null' placeholder must become a real NaN.
columns_to_be_filled_with_NaN = ["Power"]

def fill_columns_with_NaN(columns_to_be_filled_with_NaN, datasets=None):
    """Turn cells containing the text ``null`` into real NaN values, in place.

    Parameters
    ----------
    columns_to_be_filled_with_NaN : list of str
        Names of the columns to clean.
    datasets : iterable of pandas.DataFrame, optional
        Frames to clean. When omitted, the notebook-level ``data_train`` and
        ``data_test`` frames are used, which is what the original function did.

    Returns
    -------
    None
        The frames are modified in place.
    """
    if datasets is None:
        # Looked up at call time so the default follows the current notebook frames.
        datasets = (data_train, data_test)

    for dataset in datasets:
        for column in columns_to_be_filled_with_NaN:
            # The pattern is the literal word "null". The previous pattern
            # "[null]" was a character class, i.e. it matched ANY string that
            # contained an 'n', 'u' or 'l'. Because the replacement value is
            # not a string, pandas replaces the whole matching cell with NaN.
            # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
            dataset[column] = dataset[column].replace(
                to_replace=r"null", value=np.nan, regex=True
            )

# Apply the 'null' -> NaN conversion to the columns listed above.
fill_columns_with_NaN(columns_to_be_filled_with_NaN)

# Columns to impute in each frame; note the test list does not include "Mileage".
empty_cols_list_train = ["Mileage","Engine","Power","Seats"] 
empty_cols_list_test = ["Engine","Power","Seats"] 


def my_Nan_filling_function(dataset , empty_cols_list):
    """Fill missing values in the given columns with each column's mode, in place.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to modify.
    empty_cols_list : list of str
        Names of the columns whose NaN values should be imputed.

    Returns
    -------
    None
        ``dataset`` is modified in place and printed once all columns are done.
    """
    for column in empty_cols_list:
        modes = dataset[column].mode()
        if modes.empty:
            # Every value is missing, so there is no mode to impute with.
            # Leave the column untouched instead of failing on modes[0].
            continue

        # Assign the result back instead of calling fillna(inplace=True) on
        # dataset[column]: that chained in-place form acts on a temporary
        # object and does not update `dataset` under pandas Copy-on-Write.
        dataset[column] = dataset[column].fillna(modes.iloc[0])

    print(dataset)

# Impute the listed columns with their mode; each call also prints the whole frame.
my_Nan_filling_function(data_train , empty_cols_list_train)

my_Nan_filling_function(data_test , empty_cols_list_test)

# Columns whose values carry a unit suffix (e.g. "998 CC") and are converted to floats below.
list_of_columns_with_units = ["Mileage" , "Engine" , "Power"] 

def remove_units_from_columns(list_of_columns_with_units, datasets=None):
    """Strip the unit suffix from "<number> <unit>" columns and cast them to float.

    Parameters
    ----------
    list_of_columns_with_units : list of str
        Names of the columns holding strings such as ``"998 CC"``.
    datasets : iterable of pandas.DataFrame, optional
        Frames to convert. When omitted, the notebook-level ``data_train`` and
        ``data_test`` frames are used, which is what the original function did.

    Returns
    -------
    None
        The frames are modified in place.

    Raises
    ------
    ValueError
        If the text before the first space cannot be parsed as a float.
    """
    if datasets is None:
        # Looked up at call time so the default follows the current notebook frames.
        datasets = (data_train, data_test)

    for dataset in datasets:
        for column in list_of_columns_with_units:
            values = dataset[column]
            # Skip the string handling when the column is already numeric so
            # the cell can be re-run safely (.str would fail on a float column).
            if not pd.api.types.is_numeric_dtype(values):
                # Keep only the token before the first space. The previous
                # code assigned the multi-column result of
                # str.split(expand=True) to a single column, which current
                # pandas rejects with a ValueError.
                values = values.str.split(" ").str[0]
            dataset[column] = values.astype("float")

# Convert the unit-suffixed columns of both frames to plain floats.
remove_units_from_columns(list_of_columns_with_units)

# Store the model year as a 32-bit integer. The `copy` keyword is omitted
# because it is deprecated in recent pandas and made no difference here:
# the converted column is assigned straight back to the frame.
data_train["Year"] = data_train["Year"].astype("int32")
data_test["Year"] = data_test["Year"].astype("int32")

                                  Name    Location  ...  Seats  Price
0               Maruti Wagon R LXI CNG      Mumbai  ...    5.0   1.75
1     Hyundai Creta 1.6 CRDi SX Option        Pune  ...    5.0  12.50
2                         Honda Jazz V     Chennai  ...    5.0   4.50
3                    Maruti Ertiga VDI     Chennai  ...    7.0   6.00
4      Audi A4 New 2.0 TDI Multitronic  Coimbatore  ...    5.0  17.74
...                                ...         ...  ...    ...    ...
6014                  Maruti Swift VDI       Delhi  ...    5.0   4.75
6015          Hyundai Xcent 1.1 CRDi S      Jaipur  ...    5.0   4.00
6016             Mahindra Xylo D4 BSIV      Jaipur  ...    8.0   2.90
6017                Maruti Wagon R VXI     Kolkata  ...    5.0   2.65
6018             Chevrolet Beat Diesel   Hyderabad  ...    5.0   2.50

[6019 rows x 12 columns]
                                                   Name  ... Seats
0                               Maruti Alto K10 LXI CNG  ...   4.0


## Removing the outliers part

In [10]:
def get_numerical_features(dataset):
    """Return the names of the numeric columns of ``dataset``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    list of str
        Labels of the columns whose dtype is a NumPy numeric type, in the
        order they appear in ``dataset``.
    """
    # A single select_dtypes call is enough; the previous version ran it twice
    # and discarded the first result.
    return dataset.select_dtypes(include=np.number).columns.tolist()

# NOTE: despite the df_ prefix, these hold lists of numeric column names, not DataFrames.
df_train = get_numerical_features(data_train)
print(df_train)

df_test = get_numerical_features(data_test)
print(df_test)

# Shape before outlier removal, for comparison with the shape printed afterwards.
print(data_train.shape)

# Every numeric column is subject to outlier removal.
remove_train_outliers_list = df_train
remove_test_outliers_list = df_test

def remove_outliers(dataset , remove_outliers_list, threshold=0.99999):
    """Drop, in place, the rows holding the largest values of each listed column.

    For every column in turn, rows whose value is greater than or equal to
    ``threshold`` times that column's current maximum are removed. Columns are
    processed sequentially, so each maximum is computed on the rows that
    survived the previous columns.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to prune; rows are dropped in place.
    remove_outliers_list : list of str
        Names of the numeric columns to check.
    threshold : float, optional
        Fraction of the column maximum at or above which a row is dropped.
        Defaults to 0.99999, the value previously hard-coded here.

    Returns
    -------
    None
    """
    for column in remove_outliers_list:
        cutoff = threshold * dataset[column].max()
        # Comparisons with NaN are False, so rows with missing values are kept.
        outlier_index = dataset.index[dataset[column] >= cutoff]
        dataset.drop(index=outlier_index, inplace=True)

# Rows are dropped in place from both frames (including the test frame).
remove_outliers(data_train , remove_train_outliers_list)
remove_outliers(data_test , remove_test_outliers_list)

# Shape after outlier removal.
print(data_train.shape)

['Year', 'Kilometers_Driven', 'Mileage', 'Engine', 'Power', 'Seats', 'Price']
['Year', 'Kilometers_Driven', 'Mileage', 'Engine', 'Power', 'Seats']
(6019, 12)
(5903, 12)


## Skipping the EDA part (in the other notebook)

In [11]:
# Reference notes only: this cell defines nothing. The bare strings are
# evaluated and discarded, except the last one, which is echoed as the cell output.
# NOTE(review): data_train_csv_path and data_test_csv_path are not defined in
# the cells visible here - confirm they exist before relying on these notes.
# NOTE: df_train / df_test hold lists of numeric column names, not DataFrames.
"""
 Summary of some of the vairables
"""

'''Actual train data '''
# data_train  
'''Actual train data xlsx file '''
# data_train_path   
'''Actual train data csv file '''
# data_train_csv_path 
''' Only Numerical Features of train data  '''
# df_train  


'''Actual test data '''
# data_test  
'''Actual test data xlsx file '''
# data_test_path   
'''Actual test data csv file '''
# data_test_csv_path 
''' Only Numerical Features of test data  '''
# df_test  


' Only Numerical Features of test data  '

# Feature Engineering and Feature Selection

## A. Feature Selection with Filtering Methods

- Univariate -> Fisher Score, Mutual Information Gain, Variance, etc.
- Multivariate -> Pearson Correlation

### Univariate Feature Selection

- Constant, Quasi-Constant and Duplicate Feature Removal


In [0]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import VarianceThreshold

In [13]:
# Preview after preprocessing: Mileage, Engine and Power are now plain floats without units.
data_train.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6,998.0,58.16,5.0,1.75
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67,1582.0,126.2,5.0,12.5
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2,1199.0,88.7,5.0,4.5
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77,1248.0,88.76,7.0,6.0
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2,1968.0,140.8,5.0,17.74


In [14]:
'''
CONSTANT FEATURES REMOVAL
 Dropping Categorical variables Temporarily
 This is necessary for this particaular feature selection method
'''
# Feature matrix: everything except the text/categorical columns and the target.
X = data_train.drop(
    columns=["Name", "Location", "Fuel_Type", "Transmission", "Owner_Type", "Price"]
)
# Target vector: the price to be predicted.
y = data_train.loc[:, "Price"]

(X.shape, y.shape)

((5903, 6), (5903,))

In [15]:
# Hold out 20% of the rows for evaluation; random_state fixes the shuffle for reproducibility.
# No `stratify` argument is passed: the target (Price) is continuous, not a class label.
X_train , X_test , y_train , y_test = train_test_split(X , y , test_size = 0.2 , random_state = 0 )
X_train.shape , X_test.shape , y_train.shape , y_test.shape 

((4722, 6), (1181, 6), (4722,), (1181,))

In [16]:
# threshold=0 targets features that do not vary at all (constant columns).
# The filter is fitted on the training split only.
constant_filter = VarianceThreshold(threshold = 0 )
constant_filter.fit(X_train)

VarianceThreshold(threshold=0)

In [17]:
# Count the features the constant filter keeps (True in the support mask = retained).
# Nothing is removed here; this only inspects the fitted filter.
constant_filter.get_support().sum()

6

In [18]:
# Flip the support mask so that True marks a constant (to-be-removed) feature.
constant_list = (~constant_filter.get_support()).tolist()
print(constant_list)

[False, False, False, False, False, False]


In [19]:
# Names of the constant features. X has the same columns as X_train, which was split from it,
# so the boolean mask computed on X_train lines up with X.columns.
X.columns[constant_list]

Index([], dtype='object')

In [0]:
'''
 Transforming the dataset into non-constant feaure space
 Bascially removing the constant features
 done to prevent overfitting of the model
 '''
# Apply the filter fitted on X_train to both splits so they keep the same columns.
X_train_filter = constant_filter.transform(X_train)
X_test_filter = constant_filter.transform(X_test)

In [21]:
# Compare shapes after and before the constant filter; equal column counts mean nothing was dropped.
X_train_filter.shape, X_test_filter.shape, X_train.shape , X_test.shape

((4722, 6), (1181, 6), (4722, 6), (1181, 6))

In [0]:
'''
QUASI CONSTANT FEATURE REMOVAL
Removing feature almost constant or somewhat near to constant(Quasi Constant)
'''
# This method removes features with variation below a certain cutoff.
# The 0.01 cutoff is applied to the raw, unscaled feature variances.
quasi_constant_filter = VarianceThreshold(threshold = 0.01)

In [23]:
# Fit on the training split that already went through the constant filter.
quasi_constant_filter.fit(X_train_filter)

VarianceThreshold(threshold=0.01)

In [24]:
# Number of features the quasi-constant filter keeps.
quasi_constant_filter.get_support().sum()

6

In [25]:
# Apply the quasi-constant filter to both splits, then compare the shapes at every stage.
X_train_quasi_filter = quasi_constant_filter.transform(X_train_filter)
X_test_quasi_filter = quasi_constant_filter.transform(X_test_filter)
X_train.shape,X_test.shape,X_train_filter.shape,X_test_filter.shape, X_train_quasi_filter.shape,X_test_quasi_filter.shape

((4722, 6), (1181, 6), (4722, 6), (1181, 6), (4722, 6), (1181, 6))

In [26]:
'''
DUPLICATE FEATURE REMOVAL
'''
# Transpose so that each feature becomes a row; duplicate features can then be
# found with a row-wise duplicate check.
X_train_T = X_train_quasi_filter.T
X_test_T = X_test_quasi_filter.T
type(X_train_T),type(X_test_T)

(numpy.ndarray, numpy.ndarray)

In [27]:
# Changing it back to Pandas Dataframe
# The frames are built from bare arrays, so rows/columns get default integer
# labels and the original feature names are not carried over.
X_train_T = pd.DataFrame(X_train_T)
X_test_T = pd.DataFrame(X_test_T)
X_train_T.shape , X_test_T.shape

((6, 4722), (6, 1181))

In [28]:
# Number of feature rows that exactly repeat an earlier feature row.
X_train_T.duplicated().sum()

0

In [29]:
# Finding the duplicated features
# Boolean Series indexed by feature position; True marks a repeat of an earlier feature.
# Duplicates are detected on the training split only.
duplicated_features = X_train_T.duplicated()
print(duplicated_features)

0    False
1    False
2    False
3    False
4    False
5    False
dtype: bool


In [30]:
# Invert the duplicated mask: True now marks a feature that should be kept.
features_to_keep = (~duplicated_features).tolist()
print(features_to_keep)

[True, True, True, True, True, True]


In [31]:
# After removing constant , quasi-constant and duplicate features
# Select the kept feature rows with the boolean mask, then transpose back to
# the usual samples-by-features layout. The same training-derived mask is
# applied to the test split so both keep identical features.
X_train_unique = X_train_T[features_to_keep].T
X_test_unique = X_test_T[features_to_keep].T
X_train_unique.shape,X_train.shape

((4722, 6), (4722, 6))

In [32]:
'''
UNFORTUNATELY,there are:
0 constant
0 quasi-constant 
0 duplicate features
'''

'\nUNFORTUNATELY,there are:\n0 constant\n0 quasi-constant \n0 duplicate features\n'

In [33]:
'''
This is a regression problem NOT classification
'''

# def run_random_Forest_Classifiers(X_train,X_test , y_train , y_test):
#     clf = RandomForestClassifier(n_estimators= 100 , random_state = 0 , n_jobs = -1)
#     clf.fit(X_train , y_train)
#     y_pred = clf.predict(X_test)
#     print("Accuracy based on Random Forest Classifier on the testing set:")
#     print(accuracy_score(y_test , y_pred))

# %%time
# run_random_Forest_Classifiers(X_train , X_test, y_train , y_test)
# %%time
# run_random_Forest_Classifiers(X_train_filter , X_test_filter, y_train , y_test)
# %%time
# run_random_Forest_Classifiers(X_train_quasi_filter , X_test_quasi_filter, y_train , y_test)
# %%time
# run_random_Forest_Classifiers(X_train_unique , X_test_unique, y_train , y_test)

'\nThis is a regression problem NOT classification\n'