# Import Packages

In [1]:
import numpy as np 
import pandas as pd 

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.feature_selection import SelectKBest, mutual_info_regression

from sklearn.linear_model import LinearRegression

from sklearn.metrics import r2_score

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/mercedes-benz-greener-manufacturing/train.csv.zip
/kaggle/input/mercedes-benz-greener-manufacturing/sample_submission.csv.zip
/kaggle/input/mercedes-benz-greener-manufacturing/test.csv.zip


# Import Data

Video Link: https://youtu.be/PpEDks6k88U

In [2]:
train = pd.read_csv('/kaggle/input/mercedes-benz-greener-manufacturing/train.csv.zip') 
test = pd.read_csv('/kaggle/input/mercedes-benz-greener-manufacturing/test.csv.zip') 

## Check Data

In [3]:
train.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4209 entries, 0 to 4208
Columns: 378 entries, ID to X385
dtypes: float64(1), int64(369), object(8)
memory usage: 12.1+ MB


In [5]:
train.describe(include=['object'])

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8
count,4209,4209,4209,4209,4209,4209,4209,4209
unique,47,27,44,7,4,29,12,25
top,z,aa,as,c,d,w,g,j
freq,360,833,1659,1942,4205,231,1042,277


In [6]:
train.describe(include=['float64'])

Unnamed: 0,y
count,4209.0
mean,100.669318
std,12.679381
min,72.11
25%,90.82
50%,99.15
75%,109.01
max,265.32


# Treat Target Column Outliers

In [None]:
train['y'].hist(bins=10)

In [None]:
train['y'].plot.box()

In [None]:
c=(y_trail[y_trail > 250])
c.count

In [None]:
Q1 = train['y'].quantile(0.25)
Q3 = train['y'].quantile(0.75)

IQR = Q3 - Q1

lower_whisker = Q1 - 1.5 * IQR
upper_whisker = Q3 + 1.5 * IQR

print(lower_whisker)
print(upper_whisker)

In [None]:
outlier_index = train[(train.y > upper_whisker)].index
outlier_index

In [None]:
#outlier_count = train[(train.y > upper_whisker)]
#outlier_count.count()

## Drop Outliers from Train

In [None]:
#train = train.drop(outlier_index)

In [None]:
#train.info()

## Divide Data into X and y

In [7]:
y = train['y']
X = train.drop(['y','ID'], axis = 1)
X_test = test.drop(['ID'], axis = 1)

## Train & Validation Split

In [8]:
X_train, X_val, y_train, y_val = train_test_split(X,y, test_size = 0.2, random_state = 42)

## Get Categorical and Numerical Feature Names

In [9]:
numerical_features = X.select_dtypes(include = 'number').columns.values
numerical_features

array(['X10', 'X11', 'X12', 'X13', 'X14', 'X15', 'X16', 'X17', 'X18',
       'X19', 'X20', 'X21', 'X22', 'X23', 'X24', 'X26', 'X27', 'X28',
       'X29', 'X30', 'X31', 'X32', 'X33', 'X34', 'X35', 'X36', 'X37',
       'X38', 'X39', 'X40', 'X41', 'X42', 'X43', 'X44', 'X45', 'X46',
       'X47', 'X48', 'X49', 'X50', 'X51', 'X52', 'X53', 'X54', 'X55',
       'X56', 'X57', 'X58', 'X59', 'X60', 'X61', 'X62', 'X63', 'X64',
       'X65', 'X66', 'X67', 'X68', 'X69', 'X70', 'X71', 'X73', 'X74',
       'X75', 'X76', 'X77', 'X78', 'X79', 'X80', 'X81', 'X82', 'X83',
       'X84', 'X85', 'X86', 'X87', 'X88', 'X89', 'X90', 'X91', 'X92',
       'X93', 'X94', 'X95', 'X96', 'X97', 'X98', 'X99', 'X100', 'X101',
       'X102', 'X103', 'X104', 'X105', 'X106', 'X107', 'X108', 'X109',
       'X110', 'X111', 'X112', 'X113', 'X114', 'X115', 'X116', 'X117',
       'X118', 'X119', 'X120', 'X122', 'X123', 'X124', 'X125', 'X126',
       'X127', 'X128', 'X129', 'X130', 'X131', 'X132', 'X133', 'X134',
       'X135',

In [10]:
categorical_features = X.select_dtypes(exclude = 'number').columns.values
categorical_features

array(['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8'], dtype=object)

In [None]:
## Check if it works without .columns and .values
##categorical_t = X.select_dtypes(exclude = 'number').columns.values
##categorical_t

# Preprocessing

## Categorical Encoding

In [11]:
oe = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value= np.NaN)
oe.fit(X_train[categorical_features])

OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=nan)

In [12]:
# transform Train
X_train[categorical_features] = oe.transform(X_train[categorical_features])

# transform Val
X_val[categorical_features] = oe.transform(X_val[categorical_features])

# transform Test
X_test[categorical_features] = oe.transform(X_test[categorical_features])

## Missing Value treatment

In [13]:
impute = SimpleImputer(strategy = 'median')
impute.fit(X_train)

SimpleImputer(strategy='median')

In [14]:
# transform Train
X_train = impute.transform(X_train)

# transform Val
X_val = impute.transform(X_val)

# transform Test
X_test = impute.transform(X_test)

## Transformation

In [15]:
scaler = StandardScaler()
scaler.fit(X_train)

StandardScaler()

In [16]:
# transform Train
X_train = scaler.transform(X_train)

# transform Val
X_val = scaler.transform(X_val)

# transform Test
X_test = scaler.transform(X_test)

## Select top Features

In [17]:
sel = SelectKBest(mutual_info_regression, k = 30)
sel.fit(X_train, y_train)

SelectKBest(k=30,
            score_func=<function mutual_info_regression at 0x7fab1ecedb00>)

In [18]:
# transform Train
X_train = sel.transform(X_train)

# transform Val
X_val = sel.transform(X_val)

# transform Test
X_test = sel.transform(X_test)

# Build Model

In [None]:
lr = LinearRegression()
lr.fit(X_train, y_train)

In [None]:
from sklearn import svm
svm_regr = svm.SVR()
svm_regr.fit(X_train, y_train)

In [20]:
from sklearn.linear_model import HuberRegressor
huber = HuberRegressor(max_iter=1000)
huber.fit(X_train, y_train)

HuberRegressor(max_iter=1000)

## Predict Y

In [21]:
y_train_pred = lr.predict(X_train)
y_val_pred = lr.predict(X_val)

y_test_pred = lr.predict(X_test)

NameError: name 'lr' is not defined

In [None]:
y_train_pred2 = svm_regr.predict(X_train)
y_val_pred2 = svm_regr.predict(X_val)

y_test_pred2 = svm_regr.predict(X_test)

In [22]:
y_train_pred3 = huber.predict(X_train)
y_val_pred3 = huber.predict(X_val)

y_test_pred3 = huber.predict(X_test)

# Check R Squared

In [None]:
print(r2_score(y_train, y_train_pred))
print(r2_score(y_val, y_val_pred))

In [None]:
print(r2_score(y_train, y_train_pred2))
print(r2_score(y_val, y_val_pred2))

In [23]:
print(r2_score(y_train, y_train_pred3))
print(r2_score(y_val, y_val_pred3))

0.527962294776988
0.5594374561618698


# Submission 

In [None]:
submission = pd.DataFrame({
    'ID' : test['ID'],
    'y' : y_test_pred
})

submission.head()

In [None]:
submission2 = pd.DataFrame({
    'ID' : test['ID'],
    'y' : y_test_pred2
})

submission2.head()

In [24]:
submission3 = pd.DataFrame({
    'ID' : test['ID'],
    'y' : y_test_pred3
})

submission3.head()

Unnamed: 0,ID,y
0,1,77.308016
1,2,92.680128
2,3,76.081865
3,4,77.252392
4,5,109.378601


## Export 

In [None]:
submission.to_csv('Submission_LR_F30.csv', index = False)

In [None]:
submission2.to_csv('Submission_Svm_F30.csv', index = False)

In [25]:
submission3.to_csv('Submission_huber_F30.csv', index = False)