# Text Classification (AI or Human Written) with Word2Vec and Machine Learning Models

In [1]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; the dataset paths used later
# in the notebook all live below this mount point.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


# Importing necessary libraries

In [19]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import re
import joblib
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import MultinomialNB

# Defining file paths for the training and testing datasets stored on Google Drive.


In [20]:
# Single source of truth for the dataset location on the mounted Google Drive;
# change this one constant to point the notebook at a different copy of the data.
DATA_DIR = "/content/drive/MyDrive/AI_Content_Detector/updated_dataset"

# CSV files holding the training and testing splits.
train_path = f"{DATA_DIR}/train_df.csv"
test_path = f"{DATA_DIR}/test_df.csv"

# Load datasets


In [21]:
# Read the two CSV splits into DataFrames (training split first, then testing).
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_path, test_path)
)

# Splitting the dataset into features (text) and labels (target).


In [7]:
# The 'text' column is the model input and the 'label' column is the target,
# for both the training and the testing frame.
X_train, y_train = train_df['text'], train_df['label']
X_test, y_test = test_df['text'], test_df['label']

# Preprocess text data (whitespace tokenization)


In [8]:
def _split_on_whitespace(text):
    """Return the whitespace-separated tokens of ``text``.

    Non-string cells (for example the NaN that ``pd.read_csv`` produces for an
    empty field) yield an empty token list instead of raising
    ``AttributeError``, so every row keeps its position relative to the labels.
    """
    return text.split() if isinstance(text, str) else []


# One token list per document, in the same order as X_train / X_test.
train_corpus = [_split_on_whitespace(text) for text in X_train]
test_corpus = [_split_on_whitespace(text) for text in X_test]


# Train Word2Vec model


In [9]:
# Fit word embeddings on the tokenized training documents only.
# NOTE(review): with more than one worker thread, training is presumably not
# bit-for-bit reproducible between runs — confirm against the gensim docs if
# exact reproducibility matters.
w2v_model = Word2Vec(
    # Input corpus: one list of tokens per document.
    sentences=train_corpus,
    # Training algorithm: 0 selects CBOW, 1 would select Skip-gram.
    sg=0,
    # Dimensionality of each learned word embedding.
    vector_size=100,
    # Context window: maximum distance between the current and predicted word.
    window=5,
    # Words seen fewer than 2 times in the corpus are ignored.
    min_count=2,
    # Number of worker threads used for training.
    workers=4,
)

# Save the Word2Vec model

In [10]:
# Write the trained Word2Vec model to "word2vec_model.model". The path is
# relative, so the file lands in the kernel's current working directory.
w2v_model.save("word2vec_model.model")



# Function to convert a corpus of tokenized sentences into vectors using the trained Word2Vec model.


In [11]:
def text_to_vector(corpus, model):
    """Embed each tokenized document as the mean of its word vectors.

    Parameters
    ----------
    corpus : iterable of iterable of str
        One sequence of tokens per document.
    model : object
        Trained embedding model exposing ``wv`` (supports ``token in wv`` and
        ``wv[token]``) and ``vector_size``.

    Returns
    -------
    numpy.ndarray
        Array with one row per document and ``model.vector_size`` columns.
        A document with no token known to the model becomes an all-zero row.
        An empty corpus yields an array of shape ``(0, model.vector_size)`` so
        the result is always two-dimensional.
    """
    wv = model.wv
    dim = model.vector_size

    vectors = []
    for words in corpus:
        # Keep only tokens the model has an embedding for.
        word_vecs = [wv[word] for word in words if word in wv]
        if word_vecs:
            vectors.append(np.mean(word_vecs, axis=0))
        else:
            # No known token: fall back to a zero placeholder vector.
            vectors.append(np.zeros(dim))

    if not vectors:
        # np.array([]) would be 1-D with shape (0,); keep the column dimension.
        return np.empty((0, dim))
    return np.array(vectors)

- The function handles sentences by averaging the Word2Vec embeddings of words in each sentence.
- If a sentence contains no words present in the Word2Vec model, a zero vector is used as a placeholder.
- The output is a NumPy array where each row corresponds to the vector representation of a sentence.

# Vectorize train and test data


In [12]:
# Turn every tokenized document into one fixed-length embedding row, using the
# same Word2Vec model for the training and the testing corpus.
X_train_vectors, X_test_vectors = (
    text_to_vector(token_lists, w2v_model)
    for token_lists in (train_corpus, test_corpus)
)


# Scale feature vectors to the [0, 1] range for models like SVM

In [13]:
# Learn the per-feature min/max from the training vectors only, then apply
# that same scaling to both splits so no test-set statistics are used.
scaler = MinMaxScaler()
scaler.fit(X_train_vectors)
X_train_vectors = scaler.transform(X_train_vectors)
X_test_vectors = scaler.transform(X_test_vectors)

In [14]:
# Serialize the fitted scaler to 'minmax_scaler.joblib' (relative path, so it
# is written to the kernel's current working directory).
joblib.dump(scaler, 'minmax_scaler.joblib')  # Save the fitted scaler


['minmax_scaler.joblib']

# Initialize models


In [15]:
# Gradient-boosted trees, evaluated with log-loss; seeded for reproducibility.
xgb_model = XGBClassifier(eval_metric='logloss', random_state=47)

# probability=True enables predict_proba; the probability estimates are fitted
# with an internal shuffled cross-validation, so the SVM is seeded with the
# same value as the other models to keep them reproducible.
svm_model = SVC(probability=True, random_state=47)

# NOTE(review): MultinomialNB is designed for count-like features, while the
# inputs here are scaled averaged embeddings; GaussianNB (already imported) may
# be a better fit — confirm before changing, since it alters reported metrics.
nb_model = MultinomialNB()

# Random forest with default hyper-parameters; seeded for reproducibility.
rf_model = RandomForestClassifier(random_state=47)

In [16]:
# Display name -> estimator. Insertion order is the order in which the models
# are trained and reported.
models = dict([
    ('XGBoost', xgb_model),
    ('SVM', svm_model),
    ('Naive Bayes', nb_model),
    ('Random Forest', rf_model),
])

# Train and evaluate models


In [17]:
# One summary row (a dict) per trained model.
results = []

for model_name, model in models.items():
    # Fit on the scaled training embeddings, then predict the held-out split.
    model.fit(X_train_vectors, y_train)
    y_pred = model.predict(X_test_vectors)

    # Score the predictions against the true test labels.
    # NOTE(review): precision/recall/F1 are called with their defaults, which
    # presumably assumes binary labels — confirm against the dataset.
    accuracy, precision, recall, f1 = (
        metric(y_test, y_pred)
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    )

    # Persist the fitted estimator; the file name is derived from the display name.
    joblib.dump(model, f"{model_name}_model.joblib")

    results.append(dict(zip(
        ('Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'),
        (model_name, accuracy, precision, recall, f1),
    )))



# Display results


In [18]:
# Collect the per-model metric rows into one table. Ending the cell with the
# frame itself (instead of print) lets Jupyter render it with its rich display.
results_df = pd.DataFrame(results)
results_df

           Model  Accuracy  Precision    Recall  F1 Score
0        XGBoost  0.957137   0.944631  0.970690  0.957483
1            SVM  0.960566   0.956410  0.964655  0.960515
2    Naive Bayes  0.720960   0.677105  0.838793  0.749326
3  Random Forest  0.945135   0.923645  0.969828  0.946173
