In [1]:
# Install the required libraries (uncomment on first run).
# %pip targets the environment of the running kernel, unlike !pip.
# %pip install nltk
# %pip install gensim

In [2]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import pickle

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer

from gensim.models import Word2Vec

from sklearn import datasets
from sklearn import svm
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report

In [3]:
# Download the necessary NLTK data files (only those not already installed).
#
# Each package id maps to the resource paths that prove it is present locally.
# WordNet may be stored as a zip archive only, in which case probing
# 'corpora/wordnet' raises LookupError even though the package is installed,
# causing a redundant download attempt on every run. Probing the archive path
# as well avoids that.
nltk_resources = {
    'punkt': ('tokenizers/punkt',),
    'wordnet': ('corpora/wordnet', 'corpora/wordnet.zip'),
    'stopwords': ('corpora/stopwords',),
}

for package, candidates in nltk_resources.items():
    for resource in candidates:
        try:
            nltk.data.find(resource)
        except LookupError:
            continue  # try the next candidate location
        break  # found locally; nothing to download
    else:
        # None of the candidate paths exist: fetch the package. quiet=True
        # suppresses the progress log, so report a failed download explicitly.
        if not nltk.download(package, quiet=True):
            print(f"Warning: could not download NLTK package '{package}'")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\indra\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
# Read the dataset CSV (relative path, resolved against the working directory)
df = pd.read_csv(filepath_or_buffer='apple_news_hourly_summarized_spacy.csv')

In [5]:
# Discard every row that has a missing value in any column
formatted_df = df.dropna(axis=0, how='any')

In [6]:
# Take an independent deep copy of the frame before adding columns to it
# (note: no columns are actually dropped in this cell)
formatted_df = formatted_df.copy(deep=True)

In [7]:
# Add an integer-typed 'Label' column derived from the existing 'label' column
formatted_df['Label'] = formatted_df.loc[:, 'label'].astype(int)

In [8]:
# Text clean-up, applied to each row in order:
#   1. strip HTML tags
#   2. drop every character that is not an ASCII letter or whitespace
#   3. lowercase the result
formatted_df['text'] = (
    formatted_df['text']
    .map(lambda raw: re.sub(r'<.*?>', '', raw))
    .map(lambda untagged: re.sub(r'[^a-zA-Z\s]', '', untagged))
    .str.lower()
)

In [9]:
# Porter stemmer and WordNet lemmatizer instances used by apply_nltk_fns
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# English stop words from the NLTK corpus, held in a set for fast membership tests
stop_words = {*stopwords.words('english')}

# Stop-word removal, stemming, lemmatization and tokenization for one document
def apply_nltk_fns(text):
    """Normalize a cleaned text string and return it as a list of tokens.

    The text is split on whitespace; words found in ``stop_words`` are
    dropped; every remaining word is stemmed and the stem is then
    lemmatized. The normalized words are re-joined with single spaces and
    passed through ``word_tokenize``, whose output is returned.
    """
    normalized = []
    for word in text.split():
        if word in stop_words:
            continue  # skip stop words entirely
        normalized.append(lemmatizer.lemmatize(stemmer.stem(word)))
    return word_tokenize(' '.join(normalized))

# Tokenize every document: 'tokens' holds the list returned by apply_nltk_fns for each row of 'text'
formatted_df['tokens'] = formatted_df['text'].map(apply_nltk_fns)

In [10]:
# Initialize and train a Word2Vec model on the tokenized documents
# (200-dimensional vectors, context window of 5, every word kept, sg=0)
word2vec_model = Word2Vec(
    sentences=formatted_df['tokens'],
    vector_size=200,
    window=5,
    min_count=1,
    sg=0,
)

In [11]:
# Save the model
# Persists the trained Word2Vec model to "word2vec.model" (a relative path,
# so the file lands in the current working directory).
word2vec_model.save("word2vec.model")

In [12]:
# Feature Extraction:
def get_doc_vector(tokens, model):
    """Return one vector for a document: the mean of its known word vectors.

    Args:
        tokens: iterable of word tokens for a single document.
        model: object exposing ``wv`` (supports ``in`` and ``[]`` lookup by
            word) and ``vector_size``.

    Returns:
        The element-wise mean of the vectors of all tokens present in
        ``model.wv``; a zero vector of length ``model.vector_size`` when
        none of the tokens is known (or ``tokens`` is empty).
    """
    embeddings = []
    for token in tokens:
        if token in model.wv:
            embeddings.append(model.wv[token])

    # No token has a vector: fall back to an all-zero document vector
    if not embeddings:
        return np.zeros(model.vector_size)
    return np.mean(embeddings, axis=0)

# Build one averaged Word2Vec vector per document, in row order of formatted_df
document_vectors = list(
    map(lambda doc_tokens: get_doc_vector(doc_tokens, word2vec_model), formatted_df['tokens'])
)

In [13]:
# Cross-validation setup: stratified, shuffled folds with a fixed seed
num_folds = 5
skf = StratifiedKFold(shuffle=True, random_state=42, n_splits=num_folds)

# Per-fold accuracies are collected here; max_accuracy tracks the best fold so far
accuracy_scores = list()
max_accuracy = 0

In [14]:
# Stratified k-fold evaluation of an RBF-kernel SVM on the document vectors.
# For each fold: train, predict, record accuracy, print a classification
# report, and pickle the classifier whenever it beats the best accuracy so far.

# Convert features once, outside the loop (previously rebuilt twice per fold)
features = np.asarray(document_vectors)
labels = formatted_df['Label']

for fold, (train_index, test_index) in enumerate(skf.split(features, labels)):

    # Split the data into training and testing sets for this fold
    X_train, X_test = features[train_index], features[test_index]
    y_train, y_test = labels.iloc[train_index], labels.iloc[test_index]

    # Create an SVM classifier
    classifier = svm.SVC(kernel='rbf', C=10, gamma=0.1, verbose=True, class_weight='balanced')

    # Train the SVM classifier on the training data
    classifier.fit(X_train, y_train)

    # Make predictions on the test data
    y_pred = classifier.predict(X_test)

    # Calculate the accuracy of the classifier for this fold
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_scores.append(accuracy)

    # Store the model with the highest held-out accuracy seen so far.
    # NOTE(review): selecting the saved model by test-fold accuracy means the
    # reported score for that fold is not an unbiased estimate for that model.
    if accuracy > max_accuracy:
        max_accuracy = accuracy
        file_save_name = 'svm_model.sav'
        # Context manager guarantees the file is flushed and closed
        with open(file_save_name, 'wb') as model_file:
            pickle.dump(classifier, model_file)

    # Generate the classification report
    report = classification_report(y_test, y_pred)

    # Print the classification report for each fold
    print(f"Classification Report - Fold {fold + 1}:\n{report}\n")

[LibSVM]Classification Report - Fold 1:
              precision    recall  f1-score   support

           0       0.63      0.63      0.63      7057
           1       0.65      0.65      0.65      7610

    accuracy                           0.64     14667
   macro avg       0.64      0.64      0.64     14667
weighted avg       0.64      0.64      0.64     14667


[LibSVM]Classification Report - Fold 2:
              precision    recall  f1-score   support

           0       0.62      0.63      0.63      7057
           1       0.65      0.64      0.65      7610

    accuracy                           0.64     14667
   macro avg       0.64      0.64      0.64     14667
weighted avg       0.64      0.64      0.64     14667


[LibSVM]Classification Report - Fold 3:
              precision    recall  f1-score   support

           0       0.62      0.62      0.62      7057
           1       0.65      0.65      0.65      7610

    accuracy                           0.63     14667
   mac

In [15]:
# Report the mean of the per-fold accuracies collected during cross-validation
average_accuracy = np.asarray(accuracy_scores).mean()
print(f"Average Accuracy Across {num_folds} Folds: {average_accuracy}")

Average Accuracy Across 5 Folds: 0.6363946273948319
