### Reference Notbooks
4. Natural language Processing IV (Section 5.4 thne 5.6)
4. NLP Exercise - do with student

### Steps to Follow
1. Try NaiveBayes classiffier first
2. Data Cleaning, lemming, etc
3. TFIDF
5. Try Random Forest (or K-means with only 2 clases?)

In [21]:
# Import necessary libraries
import pandas as pd
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.datasets import make_classification
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Importing the Training Data file
df= pd.read_csv('training_data_lowercase.csv', sep='\t', names=['tag', 'text'])

#Defining target and features
X = df['text']
y = df['tag']

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
### Initialize resources and define Functions
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Function to get the correct WordNet POS tag
def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,  # Adjectives
                "N": wordnet.NOUN,  # Nouns
                "V": wordnet.VERB,  # Verbs
                "R": wordnet.ADV}   # Adverbs
    return tag_dict.get(tag, wordnet.NOUN)  # Default to noun if POS not found

# Function to lemmatize tokens with POS tags
def lemmatize_text(tokens):
    return [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in tokens]

# Function for flexible preprocessing
def preprocess_text(text, clean=True, tokenize=True, lemmatize=True, remove_stopwords=True):
    # Step 1: Basic Cleaning (lowercasing and removing non-alphabetic characters)
    if clean:
        text = text.lower()
        text = ''.join([char if char.isalpha() or char.isspace() else ' ' for char in text])  # Keep spaces between words
    
    # Step 2: Tokenization
    if tokenize:
        tokens = word_tokenize(text)
    else:
        tokens = text.split()  # Split into words even if not tokenizing with NLTK
    
    # Step 3: Stopword Removal
    if remove_stopwords:
        tokens = [word for word in tokens if word not in stop_words]
    
    # Step 4: Lemmatization with POS tagging
    if lemmatize:
        tokens = lemmatize_text(tokens)
    
    # Return processed text
    return tokens if tokenize else ' '.join(tokens)

In [7]:
# Preprocess training and test sets
X_train_clean = X_train.apply(lambda x: preprocess_text(x, clean=True, tokenize=True, lemmatize=True, remove_stopwords=False))
X_test_clean = X_test.apply(lambda x: preprocess_text(x, clean=True, tokenize=True, lemmatize=True, remove_stopwords=False))

In [8]:
# Join the tokens back into strings for vectorization
X_train_final = X_train_clean.apply(lambda tokens: " ".join(tokens))
X_test_final = X_test_clean.apply(lambda tokens: " ".join(tokens))

In [9]:
### VECTORIZATION
# Vectorization using bag of words
bow_vect = CountVectorizer(max_features=1000)

# Fit and transform the training data, transform the test data
X_train_vect = bow_vect.fit_transform(X_train_final).toarray()
X_test_vect = bow_vect.transform(X_test_final).toarray()

# TF-IDF Vectorization
tfidf_vect = TfidfVectorizer(max_features=1000, ngram_range=(1,2))

# Fit and transform the training data, transform the test data
X_train_tfidf = tfidf_vect.fit_transform(X_train_final).toarray()
X_test_tfidf = tfidf_vect.transform(X_test_final).toarray()

In [13]:
### TRAINING AND PREDICTING WITH DIFFERENT MODELS

# Logistic Regression
lr_model = LogisticRegression(max_iter=1000).fit(X_train_vect, y_train)
y_pred_lr = lr_model.predict(X_test_vect)

# Support Vector Machine (SVM)
svm_model = SVC(kernel='linear', random_state=42).fit(X_train_vect, y_train)
y_pred_svm = svm_model.predict(X_test_vect)

# Naive Bayes classifier
nb_model = MultinomialNB().fit(X_train_vect, y_train)
y_pred_nb = nb_model.predict(X_test_vect)

# XGB Classifier
xgb_model = XGBClassifier(random_state=42).fit(list(X_train_vect), y_train)
y_pred_xgb = xgb_model.predict(list(X_test_vect))

In [None]:
# Optional: Save models for future use
""" import pickle
with open('svm_model_w2v.pkl', 'wb') as f:
    pickle.dump(svm_model, f)

with open('logreg_model_w2v.pkl', 'wb') as f:
    pickle.dump(logreg_model, f)

with open('rf_model_w2v.pkl', 'wb') as f:
    pickle.dump(rf_model, f)

with open('xgb_model_w2v.pkl', 'wb') as f:
    pickle.dump(xgb_model, f) """

In [None]:
# Get confusion matrices
cm = confusion_matrix(y_test, y_pred)

# Visualize confusion matrix
plt.figure(figsize=(4, 3))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Fake', 'Real'], yticklabels=['Fake', 'Real'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

In [25]:
acc_score_lr = accuracy_score(y_test, y_pred_lr)
print(f"Accuracy Score Log Regression: {acc_score_lr:.4f}")
report_lr = classification_report(y_test, y_pred_lr)
#print("Classification Report Log Regression:\n", report_lr)

acc_score_svm = accuracy_score(y_test, y_pred_svm)
print(f"Accuracy Score SVM: {acc_score_svm:.4f}")
report_svm = classification_report(y_test, y_pred_svm)
#print("Classification Report SVM:\n", report_svm)

acc_score_nb = accuracy_score(y_test, y_pred_nb)
print(f"Accuracy Score Naive Bayes: {acc_score_rf:.4f}")
report_nb = classification_report(y_test, y_pred_nb)
#print("Classification Report Random Forest:\n", report_rf)

acc_score_xgb = accuracy_score(y_test, y_pred_xgb)
print(f"Accuracy Score XGB Classifier: {acc_score_xgb:.4f}")
report_xgb = classification_report(y_test, y_pred_xgb)
#print("Classification Report XGB Classifier:\n", report_xgb)

Accuracy Score Log Regression: 0.9287
Accuracy Score SVM: 0.9281
Accuracy Score Naive Bayes: 0.9089
Accuracy Score XGB Classifier: 0.9122
