In [2]:
import pandas as pd
import numpy as np
import os
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import re
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score


# Directory holding the cleaned per-platform CSV files, relative to this notebook.
# Built with os.path.join so the separator is right on every OS; a literal
# "..\\data" only resolves on Windows (on Windows the value is unchanged).
folder_path = os.path.join(os.pardir, "data")
names = ["Facebook", "Reddit", "Twitter", "Youtube"]

# Maps each platform name to the DataFrame read from "cleaned_<platform>.csv".
dfs = {}

for n in names:
    dfs[n] = pd.read_csv(os.path.join(folder_path, f"cleaned_{n.lower()}.csv"))



In [3]:
# Download the NLTK data packages this notebook uses, in the same order as
# before ('punkt' first, then 'stopwords').
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

# English stop words, kept in a set so membership checks are cheap.
english_stop_words = stopwords.words('english')
stop_words = set(english_stop_words)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vital\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vital\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Translation table that deletes every character in string.punctuation.
# Built once here instead of on every call to preprocess_text.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """Normalize one raw text value and split it into tokens.

    Steps, in order: lowercase, strip ``string.punctuation`` characters,
    remove digit runs, tokenize with ``word_tokenize``, and drop tokens that
    appear in the module-level ``stop_words`` set.

    Parameters
    ----------
    text : str or any
        The raw text. Non-string values (for example ``None`` or the float
        NaN that pandas uses for empty CSV cells) are treated as empty text.

    Returns
    -------
    list of str
        The remaining tokens; an empty list when ``text`` is not a string.
    """
    # Guard against missing values: calling .lower() on NaN/None would raise
    # AttributeError and abort the whole .apply() over the column.
    if not isinstance(text, str):
        return []

    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(_PUNCTUATION_TABLE)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stop words
    return [word for word in tokens if word not in stop_words]

# Tokenize the 'text' column of every platform frame and store the result in
# a new 'tokens' column on the same frame.
for name in names:
    frame = dfs[name]
    frame['tokens'] = frame['text'].apply(preprocess_text)

In [None]:
# TODO: do data cleaning.
# Search for: "data cleaning in text mining" (how to clean Twitter data).
# After the data cleaning, I need to check the data.

In [5]:
# Train one Word2Vec model per platform, each on that platform's own token
# lists, and keep the models keyed by platform name.
# NOTE(review): gensim's documentation says training with workers > 1 is not
# exactly reproducible between runs -- confirm whether that matters for the
# metrics reported further down.
w2v_params = dict(vector_size=100, window=5, min_count=1, workers=4)

word2vec_models = {}
for name in names:
    sentences = dfs[name]['tokens'].tolist()
    model = Word2Vec(sentences=sentences, **w2v_params)
    word2vec_models[name] = model

def get_sentence_embedding(tokens, model):
    """Average the word vectors of ``tokens`` into one sentence vector.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one document. Tokens not present in ``model.wv`` are skipped.
    model : object
        A trained model exposing ``wv`` (supports ``in`` and item lookup by
        token) and ``vector_size``; in this notebook a gensim ``Word2Vec``.

    Returns
    -------
    numpy.ndarray
        The element-wise mean of the known tokens' vectors, or a float32 zero
        vector of length ``model.vector_size`` when no token is known.
    """
    # Get the embeddings for each token the model knows
    vectors = [model.wv[word] for word in tokens if word in model.wv]

    # Average the embeddings to get the sentence embedding
    if vectors:
        return np.mean(vectors, axis=0)

    # No known tokens: fall back to a zero vector. gensim stores word vectors
    # as float32 by default, so use the same dtype here; a float64 fallback
    # would silently upcast the whole matrix when rows are stacked later.
    return np.zeros(model.vector_size, dtype=np.float32)

# Add an 'embedding' column to every platform frame: one averaged vector per
# row, computed with the Word2Vec model stored for that same platform.
for name in names:
    platform_model = word2vec_models[name]
    dfs[name]['embedding'] = dfs[name]['tokens'].apply(
        lambda toks: get_sentence_embedding(toks, platform_model)
    )


In [6]:
# Train and evaluate one random-forest classifier per platform.
# NOTE(review): the output saved under this cell lists cv_mean_score and
# cv_std_score columns and no accuracy column, which this code does not
# produce -- the saved output looks stale; re-run the cell to refresh it.
results = {}
for name in names:
    # Stack the per-row embedding vectors into a single 2-D feature matrix.
    X = np.vstack(dfs[name]['embedding'].values)
    y = dfs[name]['label']

    # Hold out 20% of the rows for testing, with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Fit the forest on the training split (fit() returns the estimator).
    classifier = RandomForestClassifier(n_estimators=100, random_state=42).fit(
        X_train, y_train
    )

    # Score the held-out split.
    y_pred = classifier.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True)
    accuracy = accuracy_score(y_test, y_pred)

    results[name] = {'classification_report': report, 'accuracy': accuracy}

# Build the summary table: one row per platform, with the accuracy and the
# weighted-average precision / recall / f1-score taken from the report.
summary_rows = []
for name in names:
    weighted = results[name]['classification_report']['weighted avg']
    summary_rows.append({
        'accuracy': results[name]['accuracy'],
        'precision': weighted['precision'],
        'recall': weighted['recall'],
        'f1-score': weighted['f1-score'],
    })
results_df = pd.DataFrame(summary_rows, index=names)

# Display the results DataFrame
print(results_df)

          precision    recall  f1-score  cv_mean_score  cv_std_score
Facebook   0.706187  0.686667  0.582190       0.692250      0.004358
Reddit     0.746937  0.791888  0.722144       0.785472      0.003001
Twitter    0.893175  0.897519  0.894696       0.897188      0.011453
Youtube    0.740678  0.768992  0.708332       0.728037      0.012684
