In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline

In [4]:
df_training = pd.read_csv('twitter_training.csv', header=None, names=['Tweet ID', 'entity', 'sentiment', 'Tweet content'])
df_validation = pd.read_csv('twitter_validation.csv', header=None, names=['Tweet ID', 'entity', 'sentiment', 'Tweet content'])

# Merge the two datasets
df = pd.concat([df_training, df_validation], ignore_index=True)

# Filter the dataset by 'Tweet content' for LeagueOfLegends
df_league_of_legends = df[df['entity'].str.contains('LeagueOfLegends', case=False, na=False)]

# Drop rows with missing values in 'Tweet content'
df_league_of_legends = df_league_of_legends.dropna(subset=['Tweet content'])

# Shuffle the dataset
df_league_of_legends = shuffle(df_league_of_legends, random_state=42)

# Split the dataset into training and validation sets
X = df_league_of_legends['Tweet content']
y = df_league_of_legends['sentiment']

X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size=0.2, random_state=42)

# Display information about the resulting datasets
print("Training set size:", len(X_train))
print("Validation set size:", len(X_validation))

Training set size: 1931
Validation set size: 483


In [5]:
# Models
models = [
    MultinomialNB(),
    SVC(),
    RandomForestClassifier(),
    LogisticRegression(),
    GradientBoostingClassifier()
]

# Feature extraction methods
vectorizers = [
    ('TF-IDF', TfidfVectorizer()),
    ('Count Vectorizer', CountVectorizer())
]

# Example sentences
new_examples = [
    "I love playing League of Legends!",
    "This game is terrible. I hate it.",
    "Neutral tweet about League of Legends."
]

# Loop through models and vectorizers for training and evaluation
for model in models:
    for vectorizer_name, vectorizer in vectorizers:
        # Define a pipeline with feature engineering and a machine learning algorithm
        pipeline = Pipeline([
            ('vectorizer', vectorizer),
            ('classifier', model)
        ])

        # Train the model
        pipeline.fit(X_train, y_train)

        # Evaluate the model
        y_pred = pipeline.predict(X_validation)
        accuracy = accuracy_score(y_validation, y_pred)
        report = classification_report(y_validation, y_pred)

        # Display results
        print(f"\nModel: {model.__class__.__name__}, Vectorizer: {vectorizer_name}")
        print("Accuracy:", accuracy)
        print("Classification Report:\n", report)
        
        # Predict the sentiment of new examples
        predicted_sentiments = pipeline.predict(new_examples)
        for example, sentiment in zip(new_examples, predicted_sentiments):
            print(f"Example: '{example}' - Predicted Sentiment: {sentiment}")
        print("\n------------------------------------------------------------------------------------")



Model: MultinomialNB, Vectorizer: TF-IDF
Accuracy: 0.8902691511387164
Classification Report:
               precision    recall  f1-score   support

  Irrelevant       1.00      0.52      0.68        66
    Negative       0.86      0.97      0.91       117
     Neutral       0.86      0.96      0.91       175
    Positive       0.95      0.92      0.93       125

    accuracy                           0.89       483
   macro avg       0.92      0.84      0.86       483
weighted avg       0.90      0.89      0.88       483

Example: 'I love playing League of Legends!' - Predicted Sentiment: Positive
Example: 'This game is terrible. I hate it.' - Predicted Sentiment: Negative
Example: 'Neutral tweet about League of Legends.' - Predicted Sentiment: Negative

------------------------------------------------------------------------------------

Model: MultinomialNB, Vectorizer: Count Vectorizer
Accuracy: 0.927536231884058
Classification Report:
               precision    recall  f1-score 