# Baseline Model

## Table of Contents
1. [Model Choice](#model-choice)
2. [Feature Selection](#feature-selection)
3. [Implementation](#implementation)
4. [Evaluation](#evaluation)


In [3]:
# Import necessary libraries
import pandas as pd  # Import pandas for data manipulation and analysis.
import matplotlib.pyplot as plt  # Import matplotlib for plotting graphs and visualizations.
from wordcloud import WordCloud  # Import WordCloud for creating word cloud visualizations.
from sklearn.feature_extraction.text import CountVectorizer  # Import CountVectorizer for converting text to numerical data.
from sklearn.utils import shuffle  # Import shuffle to randomize the order of data.
from scipy.sparse import hstack  # Import hstack to combine sparse matrices.
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score  # Import methods for model validation and splitting data.
from sklearn.linear_model import LogisticRegression  # Import LogisticRegression for classification tasks.
from sklearn.ensemble import RandomForestClassifier  # Import RandomForestClassifier for classification using an ensemble of decision trees.
from sklearn.metrics import accuracy_score, classification_report  # Import metrics to evaluate model performance.
from sklearn.svm import SVC  # Import Support Vector Classifier for classification tasks.
from sklearn.neighbors import KNeighborsClassifier  # Import KNeighborsClassifier for classification using K-nearest neighbors.
from sklearn.naive_bayes import MultinomialNB  # Import MultinomialNB for Naive Bayes classification.


## Model Choice

[Explain why you've chosen a particular model as the baseline. This could be a simple statistical model or a basic machine learning model. Justify your choice.]


## Feature Selection

[Indicate which features from the dataset you will be using for the baseline model, and justify your selection.]


In [8]:
# Load the dataset
# Replace 'your_dataset.csv' with the path to your actual dataset
df = pd.read_csv('/content/Combined-Text-Dataset.csv')

# Feature selection
# Example: Selecting only two features for a simple baseline model
# X = df[['feature1', 'feature2']]
# y = df['target_variable']

# Splitting the dataset
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#We are initializing separate `CountVectorizer` objects for the `abstract`, `title`, and `keyword` columns. These vectorizers will convert the text data into numerical features suitable for machine learning models.
vectorizer_abstract = CountVectorizer()  # Initialize a CountVectorizer for the 'abstract' column to convert text data into a matrix of token counts.
vectorizer_title = CountVectorizer()  # Initialize a CountVectorizer for the 'title' column for the same purpose.
vectorizer_keyword = CountVectorizer()  # Initialize a CountVectorizer for the 'keyword' column to convert keyword text data into numerical features.


## Transforming Text Data into Numerical Features
X_abstract = vectorizer_abstract.fit_transform(df['abstract'])  # Fit the CountVectorizer on the 'abstract' column and transform the text data into a numerical feature matrix.
X_title = vectorizer_title.fit_transform(df['title'])  # Fit the CountVectorizer on the 'title' column and transform the text data into a numerical feature matrix.
X_keyword = vectorizer_keyword.fit_transform(df['keyword'])  # Fit the CountVectorizer on the 'keyword' column and transform the text data into a numerical feature matrix.

# Combine the feature matrices from 'abstract', 'title', and 'keyword' into a single sparse matrix using horizontal stacking.
X = hstack([X_abstract, X_title, X_keyword])

# Define the target variable 'y' as the 'is_human' column, which contains labels for human-written (1) and AI-generated (0) text.
y = df['is_human']

# Splitting the Data into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Split the data into training and testing sets.
# 'test_size=0.2' means 20% of the data will be used for testing, while 80% will be used for training.
# 'random_state=42' ensures reproducibility by fixing the randomness in the split.


## Implementation

[Implement your baseline model here.]



In [None]:
# Initialize and train the baseline model
# Example for a classification problem using Logistic Regression
# model = LogisticRegression()
# model.fit(X_train, y_train)

# Your implementation code here


In [16]:
# Logistic Regression
# training
results = {}  # Initialize an empty dictionary to store the results of different models.
model = LogisticRegression()  # Create an instance of the LogisticRegression model.
model.fit(X_train, y_train)  # Train the logistic regression model using the training data (X_train and y_train).

# Predictions
y_pred = model.predict(X_test)  # Use the trained logistic regression model to predict labels for the test data (X_test).


In [17]:
# Random Forest
# training
rf_model = RandomForestClassifier()  # Initialize the Random Forest classifier.
rf_model.fit(X_train, y_train)  # Train the Random Forest model using the training data.

# Predictions
y_pred_rf = rf_model.predict(X_test)  # Predict the labels for the test data using the trained Random Forest model.


In [18]:
# Support Vector Machine (SVM)
# training
svm_model = SVC()  # Initialize the Support Vector Machine (SVM) classifier.
svm_model.fit(X_train, y_train)  # Train the SVM model using the training data.

# Predictions
y_pred_svm = svm_model.predict(X_test)  # Predict the labels for the test data using the trained SVM model.


In [19]:
# K-Nearest Neighbors (KNN)
# training
knn_model = KNeighborsClassifier()  # Initialize the K-Nearest Neighbors (KNN) classifier.
knn_model.fit(X_train, y_train)  # Train the KNN model using the training data.

# Predictions
y_pred_knn = knn_model.predict(X_test)  # Predict the labels for the test data using the trained KNN model.


In [20]:
# Naive Bayes
# training
nb_model = MultinomialNB()  # Initialize the Naive Bayes classifier.
nb_model.fit(X_train, y_train)  # Train the Naive Bayes model using the training data.

# Predictions
y_pred_nb = nb_model.predict(X_test)  # Predict the labels for the test data using the trained Naive Bayes model.


## Evaluation

[Clearly state what metrics you will use to evaluate the model's performance. These metrics will serve as a starting point for evaluating more complex models later on.]



In [21]:
# Evaluate the baseline model
# Example for a classification problem
# y_pred = model.predict(X_test)
# accuracy = accuracy_score(y_test, y_pred)

# For a regression problem, you might use:
# mse = mean_squared_error(y_test, y_pred)

# Your evaluation code here


In [22]:
# Logistic Regression
# evaluation
accuracy = accuracy_score(y_test, y_pred)  # Calculate the accuracy of the model by comparing the predicted labels (y_pred) to the actual labels (y_test).

# results
print(f"Logistic_Regression Accuracy: {accuracy}")  # Print the accuracy of the model, formatted for easy readability.
results['Logistic_Regression'] = accuracy  # Store the accuracy of the logistic regression model in the 'results' dictionary under the key 'Logistic_Regression'.
report = classification_report(y_test, y_pred)  # Generate a classification report that includes precision, recall, F1-score, and support for each class.
print(f"Classification Report:\n{report}")  # Print the classification report to evaluate the model's performance.


Logistic_Regression Accuracy: 0.9818122767132186
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.98      3050
           1       0.99      0.98      0.98      3108

    accuracy                           0.98      6158
   macro avg       0.98      0.98      0.98      6158
weighted avg       0.98      0.98      0.98      6158



In [23]:
# Random Forest
# evaluation
accuracy = accuracy_score(y_test, y_pred_rf)  # Calculate the accuracy of the model by comparing the predicted labels (y_pred) to the actual labels (y_test).

# results
print(f"Random Forest Accuracy: {accuracy}")  # Print the accuracy of the model, formatted for easy readability.
results['Random Forest'] = accuracy  # Store the accuracy of the Random Forest model in the 'results' dictionary under the key 'Random Forest'.
report = classification_report(y_test, y_pred_rf)  # Generate a classification report for the Random Forest model.
print(f"Classification Report:\n{report}")  # Print the classification report to evaluate the model's performance.


Random Forest Accuracy: 0.9699577784995128
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.98      0.97      3050
           1       0.98      0.96      0.97      3108

    accuracy                           0.97      6158
   macro avg       0.97      0.97      0.97      6158
weighted avg       0.97      0.97      0.97      6158



In [24]:
# Support Vector Machine (SVM)
# evaluation
accuracy = accuracy_score(y_test, y_pred_svm)  # Calculate the accuracy of the model by comparing the predicted labels (y_pred) to the actual labels (y_test).

# results
print(f"SVM Accuracy: {accuracy}")  # Print the accuracy of the model, formatted for easy readability.
results['SVM'] = accuracy  # Store the accuracy of the SVM model in the 'results' dictionary under the key 'SVM'.
report = classification_report(y_test, y_pred_svm)  # Generate a classification report for the SVM model.
print(f"Classification Report:\n{report}")  # Print the classification report to evaluate the model's performance.


SVM Accuracy: 0.9787268593699253
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      3050
           1       0.98      0.98      0.98      3108

    accuracy                           0.98      6158
   macro avg       0.98      0.98      0.98      6158
weighted avg       0.98      0.98      0.98      6158



In [25]:
# K-Nearest Neighbors (KNN)
# evaluation
accuracy = accuracy_score(y_test, y_pred_knn)  # Calculate the accuracy of the model by comparing the predicted labels (y_pred) to the actual labels (y_test).

# results
print(f"KNN Accuracy: {accuracy}")  # Print the accuracy of the model, formatted for easy readability.
results['KNN'] = accuracy  # Store the accuracy of the KNN model in the 'results' dictionary under the key 'KNN'.
report = classification_report(y_test, y_pred_knn)  # Generate a classification report for the KNN model.
print(f"Classification Report:\n{report}")  # Print the classification report to evaluate the model's performance.


KNN Accuracy: 0.6053913608314387
Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.45      0.53      3050
           1       0.58      0.75      0.66      3108

    accuracy                           0.61      6158
   macro avg       0.61      0.60      0.60      6158
weighted avg       0.61      0.61      0.60      6158



In [26]:
# Naive Bayes
# evaluation
accuracy = accuracy_score(y_test, y_pred_nb)  # Calculate the accuracy of the model by comparing the predicted labels (y_pred) to the actual labels (y_test).

# results
print(f"Naive Bayes Accuracy: {accuracy}")  # Print the accuracy of the model, formatted for easy readability.
results['Naive Bayes'] = accuracy  # Store the accuracy of the Naive Bayes model in the 'results' dictionary under the key 'Naive Bayes'.
report = classification_report(y_test, y_pred_nb)  # Generate a classification report for the Naive Bayes model.
print(f"Classification Report:\n{report}")  # Print the classification report to evaluate the model's performance.


Naive Bayes Accuracy: 0.936342968496265
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.95      0.94      3050
           1       0.95      0.92      0.94      3108

    accuracy                           0.94      6158
   macro avg       0.94      0.94      0.94      6158
weighted avg       0.94      0.94      0.94      6158



In [27]:
results  # Display the dictionary containing accuracy scores for all models evaluated so far.


{'Logistic_Regression': 0.9818122767132186,
 'Random Forest': 0.9699577784995128,
 'SVM': 0.9787268593699253,
 'KNN': 0.6053913608314387,
 'Naive Bayes': 0.936342968496265}