## Machine Learning ProcessSteps in Data Preprocessing

#### Step 1 : Import the libraries
#### Step 2 : Import the data-set
#### Step 3 : Check out the missing values
#### Step 4 : See the Categorical Values
#### Step 5 : Splitting the data-set into Training and Test Set
#### Step 6 : Feature Scaling

# Step 1 : Import the libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# Step 2 : Import the data-set

In [2]:
df_train = pd.read_csv('train.csv', parse_dates=['timestamp'])
df_test = pd.read_csv('test.csv', parse_dates=['timestamp'])
df_marco = pd.read_csv('macro.csv', parse_dates=['timestamp'])
y_test = pd.read_csv('sample_submission.csv')

In [3]:
print('Shape of df',df_train.shape)
print('Shape of df',df_test.shape)
print('Shape of df',df_marco.shape)

Shape of df (30471, 292)
Shape of df (7662, 291)
Shape of df (2484, 100)


In [4]:
id_test = df_test.id
y = df_train['price_doc']
ylog1p_train = np.log1p(df_train['price_doc'].values)
df_train = df_train.drop(["price_doc"], axis=1)

df_train["trainOrTest"] = "train"
df_test["trainOrTest"] = "test"
df_all = pd.concat([df_train, df_test])
df_all = df_all.drop("id", axis=1)

In [5]:
df_all.head()

Unnamed: 0,timestamp,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,state,...,cafe_count_5000_price_2500,cafe_count_5000_price_4000,cafe_count_5000_price_high,big_church_count_5000,church_count_5000,mosque_count_5000,leisure_count_5000,sport_count_5000,market_count_5000,trainOrTest
0,2011-08-20,43.0,27.0,4.0,,,,,,,...,9,4,0,13,22,1,0,52,4,train
1,2011-08-23,34.0,19.0,3.0,,,,,,,...,15,3,0,15,29,1,10,66,14,train
2,2011-08-27,43.0,29.0,2.0,,,,,,,...,10,3,0,11,27,0,4,67,10,train
3,2011-09-01,89.0,50.0,9.0,,,,,,,...,11,2,1,4,4,0,0,26,3,train
4,2011-09-05,77.0,77.0,4.0,,,,,,,...,319,108,17,135,236,2,91,195,14,train


In [6]:
# Convert the date into a number (of days since some point)
fromDate = min(df_all['timestamp'])
print(fromDate)
df_all['timedelta'] = (df_all['timestamp'] - fromDate).dt.days.astype(int)
print(df_all[['timestamp', 'timedelta']].head())
df_all.drop('timestamp', axis = 1, inplace = True)

2011-08-20 00:00:00
   timestamp  timedelta
0 2011-08-20          0
1 2011-08-23          3
2 2011-08-27          7
3 2011-09-01         12
4 2011-09-05         16


# Step 3 : Check out the missing value

In [7]:
df_all.isnull().sum()

full_sq                  0
life_sq               7559
floor                  167
max_floor             9572
material              9572
                      ... 
leisure_count_5000       0
sport_count_5000         0
market_count_5000        0
trainOrTest              0
timedelta                0
Length: 291, dtype: int64

In [8]:
from sklearn.impute import SimpleImputer

##### Numerical missing Values Imputation

In [9]:
num_var = df_all.select_dtypes(include = ['int64','float64']).columns.to_list()

In [10]:
df_all[num_var].isnull().sum()

full_sq                  0
life_sq               7559
floor                  167
max_floor             9572
material              9572
                      ... 
church_count_5000        0
mosque_count_5000        0
leisure_count_5000       0
sport_count_5000         0
market_count_5000        0
Length: 274, dtype: int64

In [11]:
imputer_median = SimpleImputer(strategy="median")
imputer_median.fit(df_all[num_var])

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='median', verbose=0)

In [12]:
imputer_median.statistics_

array([5.00000000e+01, 3.00000000e+01, 7.00000000e+00, 1.20000000e+01,
       1.00000000e+00, 1.98000000e+03, 2.00000000e+00, 6.00000000e+00,
       2.00000000e+00, 1.02072151e+07, 8.38440000e+04, 1.67525670e-01,
       7.21575810e-02, 4.92600000e+03, 2.88100000e+03, 4.00000000e+00,
       5.28500000e+03, 7.32700000e+03, 5.00000000e+00, 0.00000000e+00,
       9.90000000e+02, 1.00000000e+00, 0.00000000e+00, 5.00000000e+00,
       2.00000000e+00, 0.00000000e+00, 3.00000000e+00, 2.00000000e+00,
       8.50830000e+04, 3.92270000e+04, 4.54100000e+04, 1.09880000e+04,
       5.47000000e+03, 5.34700000e+03, 5.25600000e+04, 2.63820000e+04,
       2.63330000e+04, 2.01840000e+04, 6.18000000e+03, 1.35400000e+04,
       4.92600000e+03, 2.54900000e+03, 2.39000000e+03, 5.28500000e+03,
       2.69300000e+03, 2.59200000e+03, 1.25080000e+04, 6.09600000e+03,
       6.32100000e+03, 1.76620000e+04, 8.89600000e+03, 9.17400000e+03,
       9.63300000e+03, 4.83500000e+03, 4.70200000e+03, 2.82000000e+02,
      

In [13]:
imputer_median.transform(df_all[num_var])

array([[ 43.  ,  27.  ,   4.  , ...,   0.  ,  52.  ,   4.  ],
       [ 34.  ,  19.  ,   3.  , ...,  10.  ,  66.  ,  14.  ],
       [ 43.  ,  29.  ,   2.  , ...,   4.  ,  67.  ,  10.  ],
       ...,
       [ 41.08,   1.  ,  12.  , ..., 105.  , 203.  ,  13.  ],
       [ 34.8 ,  19.8 ,   8.  , ...,   2.  ,  43.  ,  10.  ],
       [ 63.  ,  43.8 ,   5.  , ...,   4.  ,  42.  ,  11.  ]])

In [14]:
df_all[num_var] = imputer_median.transform(df_all[num_var])

In [15]:
df_all[num_var].isnull().sum().sum()

0

##### Categorical missing Value Imputations 

In [16]:
cat_var = df_all.select_dtypes(include = ['O']).columns.to_list()
cat_var

['product_type',
 'sub_area',
 'culture_objects_top_25',
 'thermal_power_plant_raion',
 'incineration_raion',
 'oil_chemistry_raion',
 'radiation_raion',
 'railroad_terminal_raion',
 'big_market_raion',
 'nuclear_reactor_raion',
 'detention_facility_raion',
 'water_1line',
 'big_road1_1line',
 'railroad_1line',
 'ecology',
 'trainOrTest']

In [17]:
df_all[cat_var].isnull().sum()

product_type                 33
sub_area                      0
culture_objects_top_25        0
thermal_power_plant_raion     0
incineration_raion            0
oil_chemistry_raion           0
radiation_raion               0
railroad_terminal_raion       0
big_market_raion              0
nuclear_reactor_raion         0
detention_facility_raion      0
water_1line                   0
big_road1_1line               0
railroad_1line                0
ecology                       0
trainOrTest                   0
dtype: int64

In [18]:
imputer_mode = SimpleImputer(strategy="most_frequent")
imputer_mode.fit(df_all[cat_var])

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='most_frequent', verbose=0)

In [19]:
imputer_mode.statistics_

array(['Investment', 'Poselenie Sosenskoe', 'no', 'no', 'no', 'no', 'no',
       'no', 'no', 'no', 'no', 'no', 'no', 'no', 'poor', 'train'],
      dtype=object)

In [20]:
imputer_mode.transform(df_all[cat_var])

array([['Investment', 'Bibirevo', 'no', ..., 'no', 'good', 'train'],
       ['Investment', 'Nagatinskij Zaton', 'yes', ..., 'no', 'excellent',
        'train'],
       ['Investment', "Tekstil'shhiki", 'no', ..., 'no', 'poor', 'train'],
       ...,
       ['OwnerOccupier', 'Tverskoe', 'yes', ..., 'no', 'excellent',
        'test'],
       ['Investment', 'Orehovo-Borisovo Juzhnoe', 'no', ..., 'no',
        'poor', 'test'],
       ['Investment', 'Chertanovo Severnoe', 'no', ..., 'no', 'poor',
        'test']], dtype=object)

In [21]:
df_all[cat_var] = imputer_mode.transform(df_all[cat_var])

In [22]:
df_all.isnull().sum().sum()

0

# Step 4 : See the Categorical Values

In [23]:
cat_var

['product_type',
 'sub_area',
 'culture_objects_top_25',
 'thermal_power_plant_raion',
 'incineration_raion',
 'oil_chemistry_raion',
 'radiation_raion',
 'railroad_terminal_raion',
 'big_market_raion',
 'nuclear_reactor_raion',
 'detention_facility_raion',
 'water_1line',
 'big_road1_1line',
 'railroad_1line',
 'ecology',
 'trainOrTest']

In [24]:
 df_all2 = pd.get_dummies(df_all,drop_first=True )

In [25]:
# Prepare separate train and test datasets
idx_train = df_all2['trainOrTest_train'] == 1
idx_test = df_all2['trainOrTest_train'] == 0

x_train = df_all2[idx_train]
x_test = df_all2[idx_test]
x_test = x_test[topest]

(7662, 438)

In [26]:
x_train.head()

Unnamed: 0,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,state,area_m,...,nuclear_reactor_raion_yes,detention_facility_raion_yes,water_1line_yes,big_road1_1line_yes,railroad_1line_yes,ecology_good,ecology_no data,ecology_poor,ecology_satisfactory,trainOrTest_train
0,43.0,27.0,4.0,12.0,1.0,1980.0,2.0,6.0,2.0,6407578.0,...,0,0,0,0,0,1,0,0,0,1
1,34.0,19.0,3.0,12.0,1.0,1980.0,2.0,6.0,2.0,9589337.0,...,0,0,0,0,0,0,0,0,0,1
2,43.0,29.0,2.0,12.0,1.0,1980.0,2.0,6.0,2.0,4808270.0,...,0,0,0,0,0,0,0,1,0,1
3,89.0,50.0,9.0,12.0,1.0,1980.0,2.0,6.0,2.0,12583540.0,...,0,0,0,0,0,1,0,0,0,1
4,77.0,77.0,4.0,12.0,1.0,1980.0,2.0,6.0,2.0,8398461.0,...,0,0,0,0,1,0,0,0,0,1


In [27]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [36]:
bestfeatures = SelectKBest(score_func=chi2,k = 40)
fit = bestfeatures.fit(x_train,y)

In [38]:
df_scores = pd.DataFrame(fit.scores_)
df_column = pd.DataFrame(x_train.columns)
featurescores = pd.concat([df_scores,df_column],axis = 1)
featurescores.columns = ['Score','feature']

In [39]:
featurescores

Unnamed: 0,Score,feature
0,4.757226e+05,full_sq
1,1.872955e+06,life_sq
2,5.670525e+04,floor
3,4.474310e+04,max_floor
4,1.286022e+04,material
...,...,...
433,8.577024e+03,ecology_good
434,1.836520e+04,ecology_no data
435,6.381795e+03,ecology_poor
436,7.375808e+03,ecology_satisfactory


In [55]:
top = featurescores.nlargest(40,'Score')

In [57]:
topest = top['feature'].to_list()

['area_m',
 'office_sqm_5000',
 'office_sqm_3000',
 'office_sqm_2000',
 'trc_sqm_5000',
 'office_sqm_1500',
 'trc_sqm_3000',
 'trc_sqm_1000',
 'trc_sqm_500',
 'trc_sqm_2000',
 'trc_sqm_1500',
 'office_sqm_1000',
 'full_all',
 'female_f',
 'male_f',
 'office_sqm_500',
 'build_year',
 'raion_popul',
 '16_29_all',
 'work_all',
 '16_29_female',
 '16_29_male',
 'work_male',
 'work_female',
 'ekder_all',
 'ekder_female',
 '0_17_all',
 'young_all',
 '0_13_all',
 '0_17_male',
 '0_17_female',
 'ekder_male',
 'young_male',
 'young_female',
 'children_school',
 '7_14_all',
 'children_preschool',
 '0_6_all',
 '0_13_male',
 '0_13_female']

# Step 6 : Feature Scaling

I am going to use Ranom Forest Regresser. When we use Random forest Regresser, Then we did not need any Features Scaling.

# Model training

In [59]:
x_train = x_train[topest]

In [73]:
# Step 1: Instantiate a random forest regressor
Model = RandomForestRegressor(n_estimators = 100, 
                              random_state = 2017, 
                              oob_score = True, 
                              max_features = 20,
                              min_samples_leaf = 8)

In [74]:
Model.fit(x_train, y)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features=20, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=8,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=True,
                      random_state=2017, verbose=0, warm_start=False)

In [75]:
print('shape of test x',x_test.shape)
print('shape of test y',y_test.shape)

shape of test x (7662, 40)
shape of test y (7662, 1)


In [71]:
y_test.drop('id',axis = 1,inplace = True)

In [64]:
x_test = x_test[topest]

In [72]:
Model.score(x_test,y_test)

-1.2148978075449917e+30