# Random Forest

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load processed_data.csv into a DataFrame; later cells read its
# 'L_Content' (text) and 'Source' (target) columns.
df = pd.read_csv("processed_data.csv")

In [None]:
# Normalise the text column in one pass: a value that is a list of words is
# joined into a single space-separated string (any other value is kept as
# is), and the resulting column is then lower-cased.
df['L_Content'] = (
    df['L_Content']
    .apply(lambda words: ' '.join(words) if isinstance(words, list) else words)
    .str.lower()
)

In [None]:
# Bag-of-words features: count unigrams and bigrams, with min_df=10 as the
# document-frequency cut-off for keeping a term in the vocabulary.
cv = CountVectorizer(
    min_df=10,
    ngram_range=(1, 2),
)
vectorized_content = cv.fit_transform(df['L_Content'])

In [None]:
# Features are the n-gram counts; the target is the 'Source' column.
X, y = vectorized_content, df['Source'].values

# Hold out 20% of the rows for testing. No random_state is given, so the
# split differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
)

In [None]:
# Baseline: a decision tree limited to depth 4, scored on the held-out rows.
treeclf = DecisionTreeClassifier(max_depth=4).fit(X_train, y_train)
y_pred = treeclf.predict(X_test)
print(confusion_matrix(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# 'Source' is a class label (it is scored with confusion_matrix and
# classification_report), so fit a random forest *classifier* rather than a
# regressor: predict() then returns class labels directly instead of
# continuous values that have to be rounded into labels.
rf = RandomForestClassifier(n_estimators=1000, max_depth=7)
rf.fit(X_train, y_train)

In [None]:
# Predict on the held-out rows and round every prediction to the nearest
# integer so it can be compared with the labels in y_test.
y_pred = list(map(round, rf.predict(X_test)))
print(confusion_matrix(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Load processed_with_attributes.csv; later cells merge the n-gram counts
# into this frame and use its 'Source' column as the target.
df1 = pd.read_csv("processed_with_attributes.csv")

In [None]:
# Expand the sparse n-gram counts into a dense DataFrame with one column per
# vocabulary term.
count_vect_df = pd.DataFrame(
    data=vectorized_content.toarray(),
    columns=cv.get_feature_names_out(),
)
# Attach the count columns to df1 by row index; the left join keeps every
# row of df1.
# NOTE(review): this assumes df1 rows line up with the rows of df - confirm.
df1 = df1.merge(count_vect_df, how='left', left_index=True, right_index=True)
df1.columns

In [None]:
# Remove the leftover index columns and the raw text columns, then discard
# every row that still contains a missing value.
df1 = df1.drop(
    columns=['Unnamed: 0.1', 'Unnamed: 0', 'Content', 'L_Content', 'Title'],
).dropna()

In [None]:
# Drop incomplete rows once, *before* separating features and target.
# Filtering X on its own (as X.dropna() did) would leave y built from the
# unfiltered frame, so X and y could end up with different rows.
complete_rows = df1.dropna()
X = complete_rows.drop(columns=['Source'])
y = complete_rows['Source'].values

# Hold out 20% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
# 'Source' is a class label (it is scored with confusion_matrix and
# classification_report), so fit a random forest *classifier* rather than a
# regressor: predict() then returns class labels directly instead of
# continuous values that have to be rounded into labels.
rf = RandomForestClassifier(n_estimators=50, max_depth=7)
rf.fit(X_train, y_train)

In [None]:
# Predict on the held-out rows and round every prediction to the nearest
# integer so it can be compared with the labels in y_test.
y_pred = list(map(round, rf.predict(X_test)))
print(confusion_matrix(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Hyperparameter search space for a randomized search over a random forest.

# Number of trees in the forest: 200, 400, ..., 2000.
n_estimators = list(range(200, 2001, 200))
# Number of features to consider at every split. The former 'auto' option
# was removed in scikit-learn 1.3; for a forest regressor (the estimator in
# the commented-out search) it meant "all features", which is spelled 1.0.
max_features = [1.0, 'sqrt']
# Maximum number of levels in a tree: 10, 20, ..., 110, plus None (no limit).
max_depth = [*range(10, 111, 10), None]
# Minimum number of samples required to split a node.
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node.
min_samples_leaf = [1, 2, 4]
# Whether each tree is trained on a bootstrap sample (True) or on the whole
# training set (False).
bootstrap = [True, False]

# Assemble the grid in the form expected by `param_distributions`.
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap,
}

In [None]:
# rf = RandomForestRegressor()
# rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# rf_random.fit(X_train, y_train)

In [None]:
# optimal = rf_random.best_estimator_
# optimal.fit(X_train, y_train)
# y_pred = optimal.predict(X_test)
# y_pred = [round(y) for y in y_pred]
# print(confusion_matrix(y_test,y_pred))
# print(classification_report(y_test,y_pred))

In [None]:
# optimal.get_params

## Random Forest using parameters found from RandomizedSearchCV

In [None]:
# 'Source' is a class label, so the forest is a classifier rather than a
# regressor. Its predictions are already class labels, so they are passed to
# the metrics without a rounding step.
test = RandomForestClassifier(bootstrap=False, max_features='sqrt', n_estimators=400, max_depth=7)
test.fit(X_train, y_train)
y_pred = test.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# Repeat the random 80/20 split ten times and pool the held-out labels and
# predictions, so the report printed at the end summarises all ten runs.
full_y_test = []
full_y_pred = []

for _ in range(10):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # 'Source' is a class label, so fit a classifier; its predictions are
    # already class labels and need no rounding.
    test = RandomForestClassifier(bootstrap=False, max_features='sqrt', n_estimators=400, max_depth=7)
    test.fit(X_train, y_train)
    y_pred = test.predict(X_test)

    # extend() appends in place instead of rebuilding the pooled lists on
    # every iteration.
    full_y_test.extend(y_test.tolist())
    full_y_pred.extend(y_pred.tolist())

print(confusion_matrix(full_y_test, full_y_pred))
print(classification_report(full_y_test, full_y_pred))