In [1]:
#Social_Network_Ads.csv

**Importing the libraries**

In [2]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split

**Importing the dataset**

In [3]:
original_features = pd.read_csv('Social_Network_Ads.csv') #
print(original_features.columns)
original_features

Index(['User ID', 'Gender', 'Age', 'EstimatedSalary', 'Purchased'], dtype='object')


Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0


In [4]:
original_features.isnull().sum()

User ID            0
Gender             0
Age                0
EstimatedSalary    0
Purchased          0
dtype: int64

In [5]:
original_features = original_features.drop_duplicates()

In [6]:
original_features.dtypes

User ID             int64
Gender             object
Age                 int64
EstimatedSalary     int64
Purchased           int64
dtype: object

In [7]:
original_features.Gender = original_features.Gender.map({'Male':1,'Female':2})

**Splitting the dataset into the Training set and Test set**

In [8]:
original_labels = np.array(original_features['Purchased'])

In [9]:
original_features= original_features.drop('Purchased', axis = 1)


In [10]:
original_feature_list = list(original_features.columns)

In [11]:
original_train_features, original_test_features,original_train_labels, original_test_labels = train_test_split(original_features, original_labels, test_size = 0.25, random_state = 42)

In [12]:
original_features.shape

(400, 4)

**Feature Scaling**

In [13]:
from sklearn.preprocessing import StandardScaler  
scaler = StandardScaler() 
scaler.fit(original_train_features) 
original_train_features_scaled = scaler.transform(original_train_features) 
original_test_features_scaled = scaler.transform(original_test_features)

**Fitting Random Forest to the Training set**

In [14]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

In [15]:
rf = RandomForestRegressor(n_estimators= 100, max_depth = 3, max_features='sqrt')

In [16]:
rf.fit(original_train_features, original_train_labels)

RandomForestRegressor(max_depth=3, max_features='sqrt')

In [17]:
predictions = rf.predict(original_test_features)

In [18]:
r2_score(predictions, original_test_labels)

0.46366298026973973

In [19]:
original_train_features.shape

(300, 4)

In [None]:
**Feature Importance**

In [30]:
importances = list(rf.feature_importances_)

feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(original_feature_list, importances)]

feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: Age                  Importance: 0.5
Variable: EstimatedSalary      Importance: 0.45
Variable: User ID              Importance: 0.05
Variable: Gender               Importance: 0.0


In [32]:
# New random forest with only the two most important variables
rf_most_important = RandomForestRegressor(n_estimators= 100, max_depth = 2, max_features='auto')

important_indices = ['Age','EstimatedSalary']
train_important = original_train_features.loc[:, important_indices]
test_important = original_test_features.loc[:, important_indices]

rf_most_important.fit(train_important, original_train_labels)

predictions = rf_most_important.predict(test_important)

r2_score(predictions, original_test_labels)

0.5333636539304397