# Predicing Year of Marriage 

- Preparing for model deployment
-  notebook is taken as prototype

# 1- Importing key modules

In [1]:
#support both Python 2 and Python 3 with minimal overhead.
from __future__ import absolute_import, division, print_function
#ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
import pprint
%matplotlib inline

# 2- Loading and Preparing data

In [3]:
data = pd.read_csv('age_of_marriage_data.csv')
data.shape

(2567, 10)

In [4]:
data.head()

Unnamed: 0,id,gender,height,religion,caste,mother_tongue,profession,location,country,age_of_marriage
0,1,female,"5'4""",,others,Telugu,,London,United Kingdom,21.0
1,2,male,"5'7""",Jain,Shwetamber,Gujarati,Doctor / Healthcare Professional,Fairfax- VA,USA,32.0
2,3,male,"5'7""",Hindu,Brahmin,Hindi,Entrepreneurs / Business,Begusarai,India,32.0
3,4,female,"5'0""",Hindu,Thakur,Hindi,Architect,Mumbai,India,30.0
4,5,male,"5'5""",Christian,Born Again,Malayalam,Sales Professional / Marketing,Sulthan Bathery,India,30.0


### 2.1.Checking missing values

In [5]:
data.isnull().sum()

id                   0
gender              29
height             118
religion           635
caste              142
mother_tongue      164
profession         330
location           155
country             16
age_of_marriage     19
dtype: int64

In [6]:
# fixing missing data
(data.shape[0] - data.dropna().shape[0])/data.shape[0]

0.24737047136735488

In [7]:
data.dropna(inplace=True)

In [8]:
data.shape

(1932, 10)

In [9]:
data.head(2)

Unnamed: 0,id,gender,height,religion,caste,mother_tongue,profession,location,country,age_of_marriage
1,2,male,"5'7""",Jain,Shwetamber,Gujarati,Doctor / Healthcare Professional,Fairfax- VA,USA,32.0
2,3,male,"5'7""",Hindu,Brahmin,Hindi,Entrepreneurs / Business,Begusarai,India,32.0


In [10]:
data.profession.nunique()

84

### 2.2.Defining dependant and independant variable

We can use all variables. But , just to use most prominent variables

In [11]:
X = data.loc[:,['gender','height','religion','caste','mother_tongue','country']]
y = data.age_of_marriage

In [12]:
X.head()

Unnamed: 0,gender,height,religion,caste,mother_tongue,country
1,male,"5'7""",Jain,Shwetamber,Gujarati,USA
2,male,"5'7""",Hindu,Brahmin,Hindi,India
3,female,"5'0""",Hindu,Thakur,Hindi,India
4,male,"5'5""",Christian,Born Again,Malayalam,India
5,male,"5'5""",Hindu,Valmiki,Hindi,India


In [13]:
print(X.shape)
print(y.shape)

(1932, 6)
(1932,)


### 2.3.Encoding categorical variable

In [14]:
from sklearn.preprocessing import LabelEncoder
enc = LabelEncoder()
X.loc[:,['gender','religion','caste','mother_tongue','country']]= \
X.loc[:,['gender','religion','caste','mother_tongue','country']].apply(enc.fit_transform)

In [15]:
X.head()

Unnamed: 0,gender,height,religion,caste,mother_tongue,country
1,1,"5'7""",2,34,6,19
2,1,"5'7""",1,14,8,5
3,0,"5'0""",1,36,8,5
4,1,"5'5""",0,13,13,5
5,1,"5'5""",1,38,8,5


In [16]:
int(X.loc[1,'height'].split('\'')[0])*30.48

152.4

In [17]:
int(X.loc[1,'height'].split('\'')[1].replace('"',''))*2.54

17.78

In [18]:
def h_cms(h):
    return int(h.split('\'')[0])*30.48+\
    int(h.split('\'')[1].replace('"',''))*2.54

In [19]:
X['height_cms'] = X.height.apply(h_cms)

In [20]:
X.head()

Unnamed: 0,gender,height,religion,caste,mother_tongue,country,height_cms
1,1,"5'7""",2,34,6,19,170.18
2,1,"5'7""",1,14,8,5,170.18
3,0,"5'0""",1,36,8,5,152.4
4,1,"5'5""",0,13,13,5,165.1
5,1,"5'5""",1,38,8,5,165.1


In [21]:
X.drop('height',inplace=True,axis=1)

In [22]:
X.head()

Unnamed: 0,gender,religion,caste,mother_tongue,country,height_cms
1,1,2,34,6,19,170.18
2,1,1,14,8,5,170.18
3,0,1,36,8,5,152.4
4,1,0,13,13,5,165.1
5,1,1,38,8,5,165.1


# 3- Model Building

### 3.1.Train-test Split

In [23]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0)

In [24]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1545, 6)
(387, 6)
(1545,)
(387,)


### 3.2.Model training

In [25]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators=80,max_depth=11)
model.fit(X_train,y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=11,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=80, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [26]:
# predict
y_predict = model.predict(X_test)
y_predict[:5]

array([28.5871571 , 33.7802169 , 29.85410153, 28.53623373, 30.28293428])

# 4-Post Model

### 4.1.Performance

In [27]:
from sklearn.metrics import mean_absolute_error, r2_score
print("MAE : ", mean_absolute_error(y_test,y_predict))
print("R-squared :", r2_score(y_test,y_predict))

MAE :  1.0229876486990297
R-squared : 0.7024032187182716


### 4.2.Prediction for out of sample data

In [28]:
oos_data=[[1,2,34,6,19,170.18]]
oos_data

[[1, 2, 34, 6, 19, 170.18]]

In [29]:
print("Predicted marriage age: ",model.predict(oos_data))

Predicted marriage age:  [32.68619981]


# 4.3.Serialize model

In [30]:
from sklearn.externals import joblib
joblib.dump(model,'marriage_age_predict_model.ml')

['marriage_age_predict_model.ml']

we can use **pickle** as well

In [31]:
import pickle
pickle_out = open("marriage_age_predict_model.pkl","wb")
pickle.dump(model, pickle_out)
pickle_out.close()