<a href="https://colab.research.google.com/github/JavariaTanveer/CCN-Codes/blob/main/random_forest_nd_logistic_regress.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Random Forest**

In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV

# Function for text cleaning
def clean_text(text):
    text = re.sub(r'\W', ' ', text)  # Remove all special characters
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)  # Remove all single characters
    text = re.sub(r'\^[a-zA-Z]\s+', ' ', text)  # Remove single characters from the start
    text = re.sub(r'\s+', ' ', text, flags=re.I)  # Replace multiple spaces with single space
    text = re.sub(r'^b\s+', '', text)  # Remove prefixed 'b'
    text = text.lower()  # Convert to lowercase
    return text

# Upload the dataset file
from google.colab import files
uploaded = files.upload()

# Load the dataset from the uploaded file
df = pd.read_excel(list(uploaded.keys())[0])

# Clean the text data
df['text'] = df['text'].apply(lambda x: clean_text(str(x)))

# Check for any missing values
df.isnull().sum()

# Fill missing values with empty strings
df.fillna('', inplace=True)

# Define the feature (X) and the target (y)
X = df['text']
y = df['label']  # Assuming the labels are in the 'label' column

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize the text data using TF-IDF with ngrams
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7, ngram_range=(1, 2))

# Fit and transform the training data
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Initialize the Random Forest classifier with hyperparameter tuning
rf_classifier = RandomForestClassifier()

# Use GridSearchCV to find the best hyperparameters
parameters = {
    'n_estimators': [100, 200],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(rf_classifier, parameters, cv=3, n_jobs=-1)
grid_search.fit(X_train_tfidf, y_train)

# Get the best model
best_rf_classifier = grid_search.best_estimator_

# Train the classifier
best_rf_classifier.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred = best_rf_classifier.predict(X_test_tfidf)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print the evaluation metrics
print(f'Best Parameters: {grid_search.best_params_}')
print(f'Accuracy: {accuracy}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(class_report)


Saving dataset.xlsx to dataset.xlsx
Best Parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Accuracy: 0.904
Confusion Matrix:
[[118   2  23]
 [  0 416  71]
 [  0  48 822]]
Classification Report:
              precision    recall  f1-score   support

        fake       1.00      0.83      0.90       143
       fake        0.89      0.85      0.87       487
        real       0.90      0.94      0.92       870

    accuracy                           0.90      1500
   macro avg       0.93      0.87      0.90      1500
weighted avg       0.91      0.90      0.90      1500



## **Logistic Regression**

In [2]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

from google.colab import files
uploaded = files.upload()

# Load the dataset from the uploaded file
df = pd.read_excel(list(uploaded.keys())[0])

# Handle missing values
df['text'].fillna('', inplace=True)

# Preprocess the text data
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['text'])
y = df['label']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the logistic regression model
log_reg = LogisticRegression(max_iter=1000)  # Increase max_iter if needed for convergence
log_reg.fit(X_train, y_train)

# Evaluate the model
y_pred = log_reg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Make predictions
# You can use the trained model to make predictions on new data if needed


Saving dataset.xlsx to dataset (1).xlsx
Accuracy: 0.9273333333333333

Classification Report:
              precision    recall  f1-score   support

        fake       1.00      0.90      0.95       143
       fake        0.90      0.90      0.90       487
        real       0.93      0.95      0.94       870

    accuracy                           0.93      1500
   macro avg       0.94      0.92      0.93      1500
weighted avg       0.93      0.93      0.93      1500

