In [5]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score
from matplotlib.colors import ListedColormap

# Load datasets
dataset = pd.read_csv('Social_Network_Ads.csv')
dataset1 = pd.read_csv('Social_Network_Adsnew.csv')
X = dataset.iloc[:, :-2].values
y = dataset.iloc[:, -1].values
print(X)
print(y)
# Display the shape of the dataset
print("Number of rows:", dataset1.shape[0])
print("Number of columns:", dataset1.shape[1])

# Check no. of missing values in every column for training dataframe
pd.DataFrame(dataset1.isnull().sum()).T.style.background_gradient(cmap='Spectral_r')

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

print('The train dataset contains {:,d} rows & {} columns'.format(X_train.shape[0], X_train.shape[1]))
print('The test dataset contains {:,d} rows & {} columns'.format(X_test.shape[0], X_test.shape[1]))

# Feature scaling
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

print(X_train)
print(y_train)
print(X_test)
print(y_test)

# Train the RandomForest model
classifier = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=0)
classifier.fit(X_train, y_train)

# Predict a single new result
print(classifier.predict(sc.transform([[30, 87000]])))

# Predict the test set results
y_pred = classifier.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)), 1))

# Evaluate the model
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Visualize the training set results
X_set, y_set = sc.inverse_transform(X_train), y_train
X1, X2 = np.meshgrid(np.arange(start=X_set[:, 0].min() - 10, stop=X_set[:, 0].max() + 10, step=1),
                     np.arange(start=X_set[:, 1].min() - 1000, stop=X_set[:, 1].max() + 1000, step=100))
plt.contourf(X1, X2, classifier.predict(sc.transform(np.array([X1.ravel(), X2.ravel()]).T)).reshape(X1.shape),
             alpha=0.75, cmap=ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1], c=ListedColormap(('red', 'green'))(i), label=j)
plt.title('Random Forest Classification (Training set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()

# Visualize the test set results
X_set, y_set = sc.inverse_transform(X_test), y_test
X1, X2 = np.meshgrid(np.arange(start=X_set[:, 0].min() - 10, stop=X_set[:, 0].max() + 10, step=1),
                     np.arange(start=X_set[:, 1].min() - 1000, stop=X_set[:, 1].max() + 1000, step=100))
plt.contourf(X1, X2, classifier.predict(sc.transform(np.array([X1.ravel(), X2.ravel()]).T)).reshape(X1.shape),
             alpha=0.75, cmap=ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1], c=ListedColormap(('red', 'green'))(i), label=j)
plt.title('Random Forest Classification (Test set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()


[[19]
 [35]
 [26]
 [27]
 [19]
 [27]
 [27]
 [32]
 [25]
 [35]
 [26]
 [26]
 [20]
 [32]
 [18]
 [29]
 [47]
 [45]
 [46]
 [48]
 [45]
 [47]
 [48]
 [45]
 [46]
 [47]
 [49]
 [47]
 [29]
 [31]
 [31]
 [27]
 [21]
 [28]
 [27]
 [35]
 [33]
 [30]
 [26]
 [27]
 [27]
 [33]
 [35]
 [30]
 [28]
 [23]
 [25]
 [27]
 [30]
 [31]
 [24]
 [18]
 [29]
 [35]
 [27]
 [24]
 [23]
 [28]
 [22]
 [32]
 [27]
 [25]
 [23]
 [32]
 [59]
 [24]
 [24]
 [23]
 [22]
 [31]
 [25]
 [24]
 [20]
 [33]
 [32]
 [34]
 [18]
 [22]
 [28]
 [26]
 [30]
 [39]
 [20]
 [35]
 [30]
 [31]
 [24]
 [28]
 [26]
 [35]
 [22]
 [30]
 [26]
 [29]
 [29]
 [35]
 [35]
 [28]
 [35]
 [28]
 [27]
 [28]
 [32]
 [33]
 [19]
 [21]
 [26]
 [27]
 [26]
 [38]
 [39]
 [37]
 [38]
 [37]
 [42]
 [40]
 [35]
 [36]
 [40]
 [41]
 [36]
 [37]
 [40]
 [35]
 [41]
 [39]
 [42]
 [26]
 [30]
 [26]
 [31]
 [33]
 [30]
 [21]
 [28]
 [23]
 [20]
 [30]
 [28]
 [19]
 [19]
 [18]
 [35]
 [30]
 [34]
 [24]
 [27]
 [41]
 [29]
 [20]
 [26]
 [41]
 [31]
 [36]
 [40]
 [31]
 [46]
 [29]
 [26]
 [32]
 [32]
 [25]
 [37]
 [35]
 [33]
 [18]
 [22

ValueError: X has 2 features, but StandardScaler is expecting 1 features as input.

In [4]:
import pickle
pickle.dump(classifier,open("randonforest.pkl","wb"))

In [5]:
model_pk=pickle.load(open("randonforest.pkl","rb"))
model_pk.predict(sc.transform([[60,150000]]))

array([1], dtype=int64)