# Random Forest

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from graphviz import Source
from sklearn import tree
from IPython.display import SVG
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report,confusion_matrix


In [2]:
df = pd.read_csv("processed_data.csv")

In [3]:
# Concatenate the list of words into a single string
df['L_Content'] = df['L_Content'].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)
# Convert the 'Content' column to lowercase
df['L_Content'] = df['L_Content'].str.lower()

In [4]:
cv = CountVectorizer(ngram_range=(1,2), min_df=10)
vectorized_content = cv.fit_transform(df['L_Content'])

In [5]:
# Split data into test and training dataset
X=vectorized_content
# x=x.merge()
y=df['Source'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [6]:
treeclf = DecisionTreeClassifier(max_depth=4)
treeclf.fit(X_train, y_train)
graph = Source(tree.export_graphviz(treeclf, out_file=None))
                                    # feature_names=feature_cols,
                                    # class_names=['Y', 'C', 'X', 'A', 'B'], filled = True))

svg = SVG(graph.pipe(format='svg'))
# display(svg)
y_pred = treeclf.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[103  36]
 [ 12 127]]
              precision    recall  f1-score   support

           0       0.90      0.74      0.81       139
           1       0.78      0.91      0.84       139

    accuracy                           0.83       278
   macro avg       0.84      0.83      0.83       278
weighted avg       0.84      0.83      0.83       278



In [7]:
rf = RandomForestRegressor(n_estimators = 1000, max_depth=7)
rf.fit(X_train, y_train)

In [8]:
y_pred = rf.predict(X_test)
y_pred = [round(y) for y in y_pred]
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[118  21]
 [ 15 124]]
              precision    recall  f1-score   support

           0       0.89      0.85      0.87       139
           1       0.86      0.89      0.87       139

    accuracy                           0.87       278
   macro avg       0.87      0.87      0.87       278
weighted avg       0.87      0.87      0.87       278



In [9]:
df

Unnamed: 0.1,Unnamed: 0,Title,Content,Source,L_Content
0,0,Silicon Valley Bank collapse concerns founders...,"The collapse of Silicon Valley Bank, a major f...",1,"['collaps', 'silicon', 'valley', 'bank', 'majo..."
1,1,Mother of young boy who shot teacher arrested ...,"In a shocking turn of events, the mother of th...",1,"['shock', 'turn', 'event', 'mother', 'young', ..."
2,2,660-mile rescue flight highlights Alaska's uni...,"ANCHORAGE, Alaska (AP) — A recent rescue missi...",1,"['anchorag', 'alaska', 'ap', '—', 'recent', 'r..."
3,3,"Suspect in leak probe talked about God, guns a...",WASHINGTON (AP) -- A former U.S. government em...,1,"['washington', 'ap', 'former', 'us', 'govern',..."
4,4,New Ram electric pickup can go up to 500 miles...,The Ram truck brand has been a dominant force ...,1,"['ram', 'truck', 'brand', 'domin', 'forc', 'am..."
...,...,...,...,...,...
1383,1383,"AOC, other pro-choicers want to ignore court r...","Alexandria Ocasio-Cortez, a heroine on the Lef...",0,"['alexandria', 'ocasiocortez', 'heroin', 'left..."
1384,1384,IMF trims global growth outlook as banking tur...,The International Monetary Fund on Tuesday cut...,0,"['intern', 'monetari', 'fund', 'tuesday', 'cut..."
1385,1385,Drake Bell is 'safe' after former Nickelodeon ...,"Former Nickelodeon actor Drake Bell, is no lon...",0,"['former', 'nickelodeon', 'actor', 'drake', 'b..."
1386,1386,Mark Wahlberg’s daughter hilariously impersona...,Mark Wahlberg isn’t the only member of his fam...,0,"['mark', 'wahlberg', 'isn', '’', 't', 'member'..."


In [36]:
# a = a.merge(b, left_index=True, right_index=True, how='left')
