In [4]:

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# 1. Load the dataset from the CSV file
data = pd.read_csv('undersampled_data.csv')

# Hold out 20% of the rows for testing; random_state is fixed so the split is repeatable
print("Splitting data using the rate of 80/20")
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Pull the 'Comment' and 'label' columns out of each partition as plain Python lists
train_texts, train_labels = (train_data[column].tolist() for column in ('Comment', 'label'))
test_texts, test_labels = (test_data[column].tolist() for column in ('Comment', 'label'))

# 2. TF-IDF features: the vectorizer is fitted on the training texts only and then
#    reused, unchanged, to transform the test texts
vectorizer = TfidfVectorizer(max_features=1000)
train_vectors = vectorizer.fit_transform(train_texts)
test_vectors = vectorizer.transform(test_texts)

# 3. Random Forest: fit on the training vectors, then predict the held-out rows
#    (fit() returns the estimator itself, so the call can be chained)
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    train_vectors, train_labels
)
rf_predictions = rf_classifier.predict(test_vectors)

# Score the Random Forest predictions against the true test labels
rf_accuracy, rf_precision, rf_recall, rf_f1 = (
    score(test_labels, rf_predictions)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
)

# One print call, one report line per argument
print(
    "Random Forest Classifier Metrics:",
    f"Accuracy: {rf_accuracy:.4f}",
    f"Precision: {rf_precision:.4f}",
    f"Recall: {rf_recall:.4f}",
    f"F1 Score: {rf_f1:.4f}",
    sep="\n",
)

# 4. Decision Tree: same features, same evaluation procedure
dt_classifier = DecisionTreeClassifier(random_state=42).fit(train_vectors, train_labels)
dt_predictions = dt_classifier.predict(test_vectors)

# Score the Decision Tree predictions against the true test labels
dt_accuracy, dt_precision, dt_recall, dt_f1 = (
    score(test_labels, dt_predictions)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
)

# The leading newline in the header leaves a blank line after the previous report
print(
    "\nDecision Tree Classifier Metrics:",
    f"Accuracy: {dt_accuracy:.4f}",
    f"Precision: {dt_precision:.4f}",
    f"Recall: {dt_recall:.4f}",
    f"F1 Score: {dt_f1:.4f}",
    sep="\n",
)



Splitting data using the rate of 80/20
Random Forest Classifier Metrics:
Accuracy: 0.7037
Precision: 0.7143
Recall: 0.6934
F1 Score: 0.7037

Decision Tree Classifier Metrics:
Accuracy: 0.6148
Precision: 0.6279
Recall: 0.5912
F1 Score: 0.6090


In [5]:

# Repeat the experiment with 30% of the rows held out for testing.
# This cell does not load anything itself: it reuses `data` and the sklearn
# imports that an earlier cell put into the kernel namespace.
print("Splitting data using the rate of 70/30")
train_data, test_data = train_test_split(data, test_size=0.3, random_state=42)

# Pull the 'Comment' and 'label' columns out of each partition as plain Python lists
train_texts, train_labels = (train_data[column].tolist() for column in ('Comment', 'label'))
test_texts, test_labels = (test_data[column].tolist() for column in ('Comment', 'label'))

# 2. TF-IDF features: a fresh vectorizer is fitted on this split's training texts
#    and then applied as-is to the test texts
vectorizer = TfidfVectorizer(max_features=1000)
train_vectors = vectorizer.fit_transform(train_texts)
test_vectors = vectorizer.transform(test_texts)

# 3. Random Forest: fit on the training vectors, then predict the held-out rows
#    (fit() returns the estimator itself, so the call can be chained)
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    train_vectors, train_labels
)
rf_predictions = rf_classifier.predict(test_vectors)

# Score the Random Forest predictions against the true test labels
rf_accuracy, rf_precision, rf_recall, rf_f1 = (
    score(test_labels, rf_predictions)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
)

# One print call, one report line per argument
print(
    "Random Forest Classifier Metrics:",
    f"Accuracy: {rf_accuracy:.4f}",
    f"Precision: {rf_precision:.4f}",
    f"Recall: {rf_recall:.4f}",
    f"F1 Score: {rf_f1:.4f}",
    sep="\n",
)

# 4. Decision Tree: same features, same evaluation procedure
dt_classifier = DecisionTreeClassifier(random_state=42).fit(train_vectors, train_labels)
dt_predictions = dt_classifier.predict(test_vectors)

# Score the Decision Tree predictions against the true test labels
dt_accuracy, dt_precision, dt_recall, dt_f1 = (
    score(test_labels, dt_predictions)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
)

# The leading newline in the header leaves a blank line after the previous report
print(
    "\nDecision Tree Classifier Metrics:",
    f"Accuracy: {dt_accuracy:.4f}",
    f"Precision: {dt_precision:.4f}",
    f"Recall: {dt_recall:.4f}",
    f"F1 Score: {dt_f1:.4f}",
    sep="\n",
)



Splitting data using the rate of 70/30
Random Forest Classifier Metrics:
Accuracy: 0.6938
Precision: 0.7071
Recall: 0.6796
F1 Score: 0.6931

Decision Tree Classifier Metrics:
Accuracy: 0.6543
Precision: 0.6701
Recall: 0.6311
F1 Score: 0.6500
