#Text Classification (AI or Human Written) with Word2Vec and Machine Learning Models

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive.
drive.mount('/content/drive')

#Importing necessary libraries

In [None]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import re
import joblib
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import MultinomialNB

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_score, recall_score, f1_score


# Define the file paths of the training and testing datasets (located directly under /content, not under the Google Drive mount point /content/drive).


In [None]:
# Paths of the separate training and test CSV files read by the loading cell.
train_path="/content/train_data.csv"
test_path ="/content/test_data.csv"

# Load the training and test datasets


In [None]:
# Read the training CSV first, then the test CSV, each into its own DataFrame.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_path, test_path)
)

In [None]:
# Show the column index of the training frame (raw text, label, and the
# per-document feature columns).
print(train_df.columns)


Index(['text', 'label', 'character_count', 'word_count', 'sentence_count',
       'paragraph_count', 'stopword_count', 'unique_word_count', 'pos_counts',
       'sentiment_polarity', 'sentiment_subjectivity',
       'discourse_marker_count', 'vocab_size', 'sentence_complexity',
       'grammatical_mistakes', 'punctuation_count',
       'sentence_length_difference', 'type_token_ratio'],
      dtype='object')


In [None]:
# Remove the 'pos_counts' column from both splits so they keep the same columns.
train_df, test_df = (
    frame.drop(columns=['pos_counts']) for frame in (train_df, test_df)
)

In [None]:
# Split the training frame into target, raw text, and the remaining feature columns.
y_train = train_df['label']
X_text_train = train_df['text']
X_features_train = train_df.drop(['text', 'label'], axis='columns')

In [None]:
# Split the test frame into target, raw text, and the remaining feature columns.
y_test = test_df['label']
X_text_test = test_df['text']
X_features_test = test_df.drop(['text', 'label'], axis='columns')


In [None]:
# Whitespace-tokenise every training document; each entry becomes a list of words.
sentences = list(map(str.split, X_text_train))

In [None]:
# Train Word2Vec model
# The hyperparameters are gathered in one mapping and unpacked into the constructor.
w2v_params = dict(
    vector_size=100,  # dimensionality of each word embedding
    window=5,         # maximum distance between the current and predicted word
    min_count=2,      # ignore words that occur fewer than 2 times in the corpus
    workers=4,        # number of worker threads used for training
    sg=0,             # training algorithm: 0 = CBOW, 1 = Skip-gram
)
# `sentences` is the tokenised training corpus (one list of words per document).
word2vec_model = Word2Vec(sentences=sentences, **w2v_params)

In [None]:
# Save the Word2Vec model
# The file name is relative, so it is written to the current working directory.
word2vec_model.save("word2vec_model.model")

In [None]:
# Function to create text embeddings by averaging word vectors
def text_to_vector(corpus, model):
    """Embed each document as the mean of its in-vocabulary word vectors.

    Args:
        corpus: Iterable of documents. Each document is either a raw string,
            which is tokenised on whitespace, or an iterable of word tokens.
        model: Object exposing ``wv`` (supports ``word in wv`` and
            ``wv[word]``) and ``vector_size``, e.g. a trained Word2Vec model.

    Returns:
        A 2-D array with one row per document and ``model.vector_size``
        columns. Documents without any in-vocabulary word get a zero row.
    """
    keyed_vectors = model.wv
    vectors = []
    for words in corpus:
        # Iterating a raw string yields single characters, which would be
        # looked up instead of words; tokenise strings the same way the
        # training corpus was built (whitespace split).
        tokens = words.split() if isinstance(words, str) else words
        word_vecs = [
            keyed_vectors[word] for word in tokens if word in keyed_vectors
        ]
        if word_vecs:
            vectors.append(np.mean(word_vecs, axis=0))
        else:
            vectors.append(np.zeros(model.vector_size))
    if not vectors:
        # Keep the result 2-D for an empty corpus so it can still be stacked
        # column-wise with other feature matrices.
        return np.zeros((0, model.vector_size))
    return np.array(vectors)

In [None]:
# Convert text to Word2Vec vectors
# Each document is whitespace-tokenised first, matching how the Word2Vec
# training corpus was built; passing the raw strings would make the
# embedding lookup operate on single characters instead of words.
X_text_train_vectors = text_to_vector(
    [text.split() for text in X_text_train], word2vec_model
)
X_text_test_vectors = text_to_vector(
    [text.split() for text in X_text_test], word2vec_model
)


In [None]:
# Combine Word2Vec vectors with engineered features
# Columns are joined side by side: embedding columns first, then the feature columns.
X_train_combined = np.concatenate(
    (X_text_train_vectors, X_features_train), axis=1
)
X_test_combined = np.concatenate(
    (X_text_test_vectors, X_features_test), axis=1
)

In [None]:
# Learn the per-column min/max on the training matrix only, then apply the
# same scaling to both the training and the test matrix.
scaler = MinMaxScaler().fit(X_train_combined)
X_train_vectors, X_test_vectors = (
    scaler.transform(matrix) for matrix in (X_train_combined, X_test_combined)
)

In [None]:
# Persist the scaler fitted on the training matrix; the file name is relative,
# so it is written to the current working directory.
joblib.dump(scaler, 'minmax_scaler.joblib')  # Save the fitted scaler

['minmax_scaler.joblib']

In [None]:
# Candidate classifiers. Every estimator that accepts a seed uses the same
# one (47) so repeated runs are comparable.
xgb_model = XGBClassifier(eval_metric='logloss', random_state=47)
# probability=True enables predict_proba via an internal, randomised
# cross-validation; seeding it keeps the probability estimates reproducible.
svm_model = SVC(probability=True, random_state=47)
# MultinomialNB has no random_state parameter.
nb_model = MultinomialNB()
rf_model = RandomForestClassifier(random_state=47)

In [None]:
# Map each display name to its estimator; the insertion order below is the
# order in which the models are trained and reported.
model_labels = ('XGBoost', 'SVM', 'Naive Bayes', 'Random Forest')
model_objects = (xgb_model, svm_model, nb_model, rf_model)
models = dict(zip(model_labels, model_objects))

In [None]:
# One row of test-set metrics is collected per model.
results = []

for model_name, model in models.items():
    # Fit on the scaled training matrix, then predict the scaled test matrix.
    y_pred = model.fit(X_train_vectors, y_train).predict(X_test_vectors)

    # Score the predictions with each metric, in the order they are reported.
    accuracy, precision, recall, f1 = (
        metric(y_test, y_pred)
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    )

    # Persist the fitted estimator under "<display name>_model.joblib".
    joblib.dump(model, f"{model_name}_model.joblib")

    results.append(dict(zip(
        ('Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'),
        (model_name, accuracy, precision, recall, f1),
    )))



In [None]:
# Turn the collected per-model metric rows into a table (one row per model,
# one column per dictionary key) and print it.
results_df = pd.DataFrame(results)
print(results_df)

           Model  Accuracy  Precision    Recall  F1 Score
0        XGBoost  0.973425   0.963682  0.983621  0.973549
1            SVM  0.938706   0.926236  0.952586  0.939227
2    Naive Bayes  0.802400   0.826941  0.762069  0.793181
3  Random Forest  0.961852   0.955745  0.968103  0.961884
