# Using cuDF and cuML (GPU)

In [1]:
import seaborn as sea
import cudf
import numpy as np

titanic = sea.load_dataset("titanic")

In [2]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [3]:
titanic = cudf.DataFrame.from_pandas(titanic)

In [4]:
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [5]:
titanic.isnull().any(axis=0)

survived       False
pclass         False
sex            False
age             True
sibsp          False
parch          False
fare           False
embarked        True
class          False
who            False
adult_male     False
deck            True
embark_town     True
alive          False
alone          False
dtype: bool

In [6]:
titanic = titanic.drop(columns = ['deck'])

In [7]:
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,Cherbourg,yes,True


In [8]:
from cuml.ensemble import RandomForestRegressor

titanicWithAge = titanic[titanic['age'].isnull() == False]
titanicWithoutAge = titanic[titanic['age'].isnull() == True]

In [9]:
titanicWithAge

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,0,3,female,39.0,0,5,29.1250,Q,Third,woman,False,Queenstown,no,False
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,Southampton,yes,True
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,Cherbourg,yes,True


In [10]:
titanicWithoutAge

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
5,0,3,male,,0,0,8.4583,Q,Third,man,True,Queenstown,no,True
17,1,2,male,,0,0,13.0000,S,Second,man,True,Southampton,yes,True
19,1,3,female,,0,0,7.2250,C,Third,woman,False,Cherbourg,yes,True
26,0,3,male,,0,0,7.2250,C,Third,man,True,Cherbourg,no,True
28,1,3,female,,0,0,7.8792,Q,Third,woman,False,Queenstown,yes,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
859,0,3,male,,0,0,7.2292,C,Third,man,True,Cherbourg,no,True
863,0,3,female,,8,2,69.5500,S,Third,woman,False,Southampton,no,False
868,0,3,male,,0,0,9.5000,S,Third,man,True,Southampton,no,True
878,0,3,male,,0,0,7.8958,S,Third,man,True,Southampton,no,True


In [11]:
variables = ['pclass', 'sibsp', 'parch', 'fare', 'age']

In [12]:
one_hot_encoded_embarked = cudf.get_dummies(titanicWithAge['embarked'], dummy_na = False)
one_hot_encoded_sex = cudf.get_dummies(titanicWithAge['sex'], dummy_na = False)
titanicWithAge = titanicWithAge[variables]
titanicWithAge = cudf.concat([titanicWithAge, one_hot_encoded_sex, one_hot_encoded_embarked], axis = 1)

one_hot_encoded_embarked = cudf.get_dummies(titanicWithoutAge['embarked'], dummy_na = False)
one_hot_encoded_sex = cudf.get_dummies(titanicWithoutAge['sex'], dummy_na = False)
titanicWithoutAge = titanicWithoutAge[variables]
titanicWithoutAge = cudf.concat([titanicWithoutAge, one_hot_encoded_sex, one_hot_encoded_embarked], axis = 1)

In [13]:
titanicWithAge

Unnamed: 0,pclass,sibsp,parch,fare,age,female,male,C,Q,S
0,3,1,0,7.2500,22.0,0,1,0,0,1
1,1,1,0,71.2833,38.0,1,0,1,0,0
2,3,0,0,7.9250,26.0,1,0,0,0,1
3,1,1,0,53.1000,35.0,1,0,0,0,1
4,3,0,0,8.0500,35.0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...
885,3,0,5,29.1250,39.0,1,0,0,1,0
886,2,0,0,13.0000,27.0,0,1,0,0,1
887,1,0,0,30.0000,19.0,1,0,0,0,1
889,1,0,0,30.0000,26.0,0,1,1,0,0


In [14]:
titanicWithoutAge

Unnamed: 0,pclass,sibsp,parch,fare,age,female,male,C,Q,S
5,3,0,0,8.4583,,0,1,0,1,0
17,2,0,0,13.0000,,0,1,0,0,1
19,3,0,0,7.2250,,1,0,1,0,0
26,3,0,0,7.2250,,0,1,1,0,0
28,3,0,0,7.8792,,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...
859,3,0,0,7.2292,,0,1,1,0,0
863,3,8,2,69.5500,,1,0,0,0,1
868,3,0,0,9.5000,,0,1,0,0,1
878,3,0,0,7.8958,,0,1,0,0,1


In [15]:
independentVariables = ['pclass', 'female', 'male', 'sibsp', 'parch', 'fare', 'C', 'Q', 'S']

rfModel_age = RandomForestRegressor()
#titanicWithAge['age'] = titanicWithAge['age'].fillna(titanicWithAge['age'].mean()).astype('int')
rfModel_age.fit(titanicWithAge[independentVariables], titanicWithAge['age'])

generatedAgeValues = rfModel_age.predict(X = titanicWithoutAge[independentVariables])

  ret_val = func(*args, **kwargs)


In [16]:
titanicWithAge

Unnamed: 0,pclass,sibsp,parch,fare,age,female,male,C,Q,S
0,3,1,0,7.2500,22.0,0,1,0,0,1
1,1,1,0,71.2833,38.0,1,0,1,0,0
2,3,0,0,7.9250,26.0,1,0,0,0,1
3,1,1,0,53.1000,35.0,1,0,0,0,1
4,3,0,0,8.0500,35.0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...
885,3,0,5,29.1250,39.0,1,0,0,1,0
886,2,0,0,13.0000,27.0,0,1,0,0,1
887,1,0,0,30.0000,19.0,1,0,0,0,1
889,1,0,0,30.0000,26.0,0,1,1,0,0


In [17]:
titanicWithoutAge['age'] = generatedAgeValues.astype(int)
data = titanicWithAge.append(titanicWithoutAge)



In [18]:
data.reset_index(inplace = True)
data.drop('index', inplace=True, axis=1)

In [19]:
data

Unnamed: 0,pclass,sibsp,parch,fare,age,female,male,C,Q,S
0,3,1,0,7.2500,22.0,0,1,0,0,1
1,1,1,0,71.2833,38.0,1,0,1,0,0
2,3,0,0,7.9250,26.0,1,0,0,0,1
3,1,1,0,53.1000,35.0,1,0,0,0,1
4,3,0,0,8.0500,35.0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...
886,3,0,0,7.2292,25.0,0,1,1,0,0
887,3,8,2,69.5500,15.0,1,0,0,0,1
888,3,0,0,9.5000,26.0,0,1,0,0,1
889,3,0,0,7.8958,27.0,0,1,0,0,1


In [20]:
age_MSE_pre = (data.age - titanicWithAge.age)
age_MSE_pre = age_MSE_pre.dropna()

MSE = 0
for each in age_MSE_pre.values:
    MSE += each**2

print("The MSE of age prediction via RFA: ", MSE/len(age_MSE_pre))

The MSE of age prediction via RFA:  411.8062746498599


---
# Using Ordinary Pandas (CPU)
---

In [21]:
import seaborn as sea
import pandas as pd
import numpy as np

titanic = sea.load_dataset("titanic")

In [22]:
titanic.isnull().any(axis=0)

survived       False
pclass         False
sex            False
age             True
sibsp          False
parch          False
fare           False
embarked        True
class          False
who            False
adult_male     False
deck            True
embark_town     True
alive          False
alone          False
dtype: bool

In [23]:
titanic = titanic.drop(columns = ['deck'])

In [24]:
from sklearn.ensemble import RandomForestRegressor

titanicWithAge = titanic[pd.isnull(titanic['age']) == False]
titanicWithoutAge = titanic[pd.isnull(titanic['age'])]

In [25]:
variables = ['pclass', 'sibsp', 'parch', 'fare', 'age']

In [26]:
one_hot_encoded_embarked = pd.get_dummies(titanicWithAge['embarked'])
one_hot_encoded_sex = pd.get_dummies(titanicWithAge['sex'])
titanicWithAge = titanicWithAge[variables]
titanicWithAge = pd.concat([titanicWithAge, one_hot_encoded_sex, one_hot_encoded_embarked], axis = 1)

one_hot_encoded_embarked = pd.get_dummies(titanicWithoutAge['embarked'])
one_hot_encoded_sex = pd.get_dummies(titanicWithoutAge['sex'])
titanicWithoutAge = titanicWithoutAge[variables]
titanicWithoutAge = pd.concat([titanicWithoutAge, one_hot_encoded_sex, one_hot_encoded_embarked], axis = 1)

In [27]:
titanicWithAge

Unnamed: 0,pclass,sibsp,parch,fare,age,female,male,C,Q,S
0,3,1,0,7.2500,22.0,0,1,0,0,1
1,1,1,0,71.2833,38.0,1,0,1,0,0
2,3,0,0,7.9250,26.0,1,0,0,0,1
3,1,1,0,53.1000,35.0,1,0,0,0,1
4,3,0,0,8.0500,35.0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...
885,3,0,5,29.1250,39.0,1,0,0,1,0
886,2,0,0,13.0000,27.0,0,1,0,0,1
887,1,0,0,30.0000,19.0,1,0,0,0,1
889,1,0,0,30.0000,26.0,0,1,1,0,0


In [28]:
titanicWithoutAge

Unnamed: 0,pclass,sibsp,parch,fare,age,female,male,C,Q,S
5,3,0,0,8.4583,,0,1,0,1,0
17,2,0,0,13.0000,,0,1,0,0,1
19,3,0,0,7.2250,,1,0,1,0,0
26,3,0,0,7.2250,,0,1,1,0,0
28,3,0,0,7.8792,,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...
859,3,0,0,7.2292,,0,1,1,0,0
863,3,8,2,69.5500,,1,0,0,0,1
868,3,0,0,9.5000,,0,1,0,0,1
878,3,0,0,7.8958,,0,1,0,0,1


In [29]:
independentVariables = ['pclass', 'female', 'male', 'sibsp', 'parch', 'fare', 'C', 'Q', 'S']

rfModel_age = RandomForestRegressor()
rfModel_age.fit(titanicWithAge[independentVariables], titanicWithAge['age'])

generatedAgeValues = rfModel_age.predict(X = titanicWithoutAge[independentVariables])

In [30]:
titanicWithoutAge['age'] = generatedAgeValues.astype(int)
data = titanicWithAge.append(titanicWithoutAge)

  data = titanicWithAge.append(titanicWithoutAge)


In [31]:
data.reset_index(inplace=True)
data.drop('index',inplace=True,axis=1)

In [32]:
data

Unnamed: 0,pclass,sibsp,parch,fare,age,female,male,C,Q,S
0,3,1,0,7.2500,22.0,0,1,0,0,1
1,1,1,0,71.2833,38.0,1,0,1,0,0
2,3,0,0,7.9250,26.0,1,0,0,0,1
3,1,1,0,53.1000,35.0,1,0,0,0,1
4,3,0,0,8.0500,35.0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...
886,3,0,0,7.2292,25.0,0,1,1,0,0
887,3,8,2,69.5500,14.0,1,0,0,0,1
888,3,0,0,9.5000,25.0,0,1,0,0,1
889,3,0,0,7.8958,27.0,0,1,0,0,1


In [33]:
age_MSE_pre = (data.age - titanicWithAge.age)

In [34]:
age_MSE_pre = age_MSE_pre.dropna()

In [35]:
MSE = 0
for each in age_MSE_pre:
    MSE += each**2

print("The MSE of age prediction via RFA: ", MSE/len(age_MSE_pre))

The MSE of age prediction via RFA:  413.89543431372545
