<a href="https://colab.research.google.com/github/Naveen963/MachineLearning-Algorithms/blob/master/Randomforest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Building Machine Learning Classifiers:Building a basic Random Forest Model

In [0]:
import nltk
import pandas as pd
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download("stopwords")
nltk.download("wordnet")

data = pd.read_csv("SMSSpamCollection.tsv",sep="\t")
data.columns = ['labels','body_text']
stopwords = nltk.corpus.stopwords.words("english")
ps = nltk.PorterStemmer()

def count_punct(text):
  count = sum([1 for word in text if word in string.punctuation])
  return round(count/(len(text)-text.count(" ")),3)*100

data["body_len"] = data["body_text"].apply(lambda x:len(x)-x.count(" "))
data["punct%"] = data["body_text"].apply(lambda x:count_punct(x))

def clear_text(text):
  text="".join([word.lower() for word in text if word not in string.punctuation])
  tokens=re.split("\W+",text)
  text=[ps.stem(word) for word in tokens if word not in stopwords]
  return text
tfidf_vect=TfidfVectorizer(analyzer=clear_text)
X_tfidf=tfidf_vect.fit_transform(data["body_text"])
X_features=pd.concat([data['body_len'],data["punct%"],pd.DataFrame(X_tfidf.toarray())],axis=1)
X_features.head(50)

In [0]:
from sklearn.ensemble import RandomForestClassifier

In [0]:
print(dir(RandomForestClassifier))
print(RandomForestClassifier())

In [0]:
from sklearn.model_selection import KFold,cross_val_score

In [0]:
rf=RandomForestClassifier(n_jobs=-1)
k_folds=KFold(n_splits=5)
cross_val_score(rf,X_features,data['labels'],cv=k_folds,scoring="accuracy",n_jobs=-1)

In [0]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

In [0]:
X_train,X_test,Y_train,Y_test=train_test_split(X_features,data['labels'],test_size=0.2)

In [0]:
RandomForestClassifier(n_estimators=50,max_depth=20,n_jobs=-1)
rf_model=rf.fit(X_train,Y_train)

In [0]:
sorted(zip(rf_model.feature_importances_,X_train.columns),reverse=True)[0:10]

[(0.0457915709412991, 'body_len'),
 (0.028792041563502156, 1803),
 (0.02596507353876682, 7350),
 (0.025300378144450436, 2031),
 (0.024818104983428672, 4796),
 (0.02122927466553095, 3134),
 (0.01576177259580697, 6285),
 (0.013961375272507319, 5724),
 (0.012263525331041199, 7027),
 (0.011588022753052055, 5988)]

In [0]:
y_pred=rf_model.predict(X_test)
precision,recall,fscore,support=score(Y_test,y_pred,pos_label="spam",average="binary")
print("Precision : {}\n Recall : {}\n Accuracy:{}".format(round(precision,3),round(recall,3),(y_pred==Y_test).sum()/len(y_pred)))

Precision : 1.0
 Recall : 0.949
 Accuracy:0.992818671454219


BUild our own Grid Search


In [0]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support as score

In [0]:
X_train,X_test,Y_train,Y_test=train_test_split(X_features,data['labels'],test_size=0.2)

In [0]:
def train_RF(n_est,depth):
  rf=RandomForestClassifier(n_estimators=n_est,max_depth=depth,n_jobs=-1)
  rf_model=rf.fit(X_train,Y_train)
  y_pred=rf_model.predict(X_test)
  precision,recall,fscore,support=score(Y_test,y_pred,pos_label="spam",average="binary")
  print("Est:{} \n Depth:{}\n Precision:{}\n Recall:{}\n Accuracy:{}".format(n_est,depth,round(precision,3),round(recall,3),round((y_pred==Y_test).sum()/len(y_pred),3)))

In [0]:
for n_est in [10,50,100]:
  for depth in [10,20,30,None]:
    train_RF(n_est,depth)

Est:10 
 Depth:10
 Precision:1.0
 Recall:0.227
 Accuracy:0.887
Est:10 
 Depth:20
 Precision:1.0
 Recall:0.589
 Accuracy:0.94
Est:10 
 Depth:30
 Precision:1.0
 Recall:0.663
 Accuracy:0.951
Est:10 
 Depth:None
 Precision:1.0
 Recall:0.785
 Accuracy:0.969
Est:50 
 Depth:10
 Precision:1.0
 Recall:0.227
 Accuracy:0.887
Est:50 
 Depth:20
 Precision:1.0
 Recall:0.632
 Accuracy:0.946
Est:50 
 Depth:30
 Precision:1.0
 Recall:0.675
 Accuracy:0.952
Est:50 
 Depth:None
 Precision:1.0
 Recall:0.822
 Accuracy:0.974
Est:100 
 Depth:10
 Precision:1.0
 Recall:0.178
 Accuracy:0.88
Est:100 
 Depth:20
 Precision:1.0
 Recall:0.601
 Accuracy:0.942
Est:100 
 Depth:30
 Precision:1.0
 Recall:0.706
 Accuracy:0.957
Est:100 
 Depth:None
 Precision:1.0
 Recall:0.798
 Accuracy:0.97
