# Exploratory Data Analysis

In [23]:
import os
import csv
import numpy as np
import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt

# Dataframes loading

In [19]:
#get folder path
folder_path = os.path.dirname(os.path.dirname(os.path.realpath("EDA.ipynb")))

#get small data path
small_neg_path = os.path.join(folder_path, "twitter-datasets", "train_neg.txt")
small_pos_path = os.path.join(folder_path, "twitter-datasets", "train_pos.txt")
test_path = os.path.join(folder_path, "twitter-datasets", "test_data.txt")

#create small data dataframe
with open(small_neg_path, 'r') as file:
    lines_neg = file.readlines()
with open(small_pos_path, 'r') as file:
    lines_pos = file.readlines()
with open(test_path, 'r') as file:
    lines_test = file.readlines()

small_neg_df = pd.DataFrame({'Tweets': lines_neg, 'Sentiment': -1})
small_pos_df = pd.DataFrame({'Tweets': lines_pos, 'Sentiment': 1})
test_df= pd.DataFrame({'Tweets': lines_test})

# Preprocessing

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Combine your positive and negative dataframes
combined_df = pd.concat([small_neg_df, small_pos_df], ignore_index=True)

# Assuming 'Tweets' contains the tweet text and 'Sentiment' contains the sentiment labels
X = combined_df['Tweets']
y = combined_df['Sentiment']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 3))  # You can adjust max_features as needed

# Fit and transform the training data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Transform the testing data
X_test_tfidf = tfidf_vectorizer.transform(X_test)

X_pred= tfidf_vectorizer.transform(test_df['Tweets'])

# Logistic regression

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Initialize the Logistic Regression model
logreg_model = LogisticRegression(random_state=42)

# Train the model on the TF-IDF transformed training data
logreg_model.fit(X_train_tfidf, y_train)

# Predict sentiment on the testing data
y_pred = logreg_model.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Display additional metrics
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.80

Classification Report:
              precision    recall  f1-score   support

          -1       0.82      0.78      0.80     19993
           1       0.79      0.82      0.81     20007

    accuracy                           0.80     40000
   macro avg       0.80      0.80      0.80     40000
weighted avg       0.80      0.80      0.80     40000



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Gradient boosting

In [15]:
from xgboost import XGBClassifier

# Convert -1 to 0 in sentiment labels
y_train_binary = y_train.map({-1: 0, 1: 1})
y_test_binary = y_test.map({-1: 0, 1: 1})

# Initialize the XGBoost model
xgb_model = XGBClassifier(random_state=42)

# Train the model on the TF-IDF transformed training data
xgb_model.fit(X_train_tfidf, y_train_binary)

# Predict sentiment on the testing data
y_pred_xgb = xgb_model.predict(X_test_tfidf)

# Evaluate the model
accuracy_xgb = accuracy_score(y_test_binary, y_pred_xgb)
print(f"XGBoost Accuracy: {accuracy_xgb:.2f}")

# Display additional metrics
print("\nClassification Report:")
print(classification_report(y_test_binary, y_pred_xgb))



  if is_sparse(data):


XGBoost Accuracy: 0.79

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.71      0.77     19993
           1       0.75      0.88      0.81     20007

    accuracy                           0.79     40000
   macro avg       0.80      0.79      0.79     40000
weighted avg       0.80      0.79      0.79     40000



# SVC

In [26]:
from sklearn.svm import SVC

# Initialize the SVM model
svm_model = SVC(kernel='linear', random_state=42)

# Train the model on the TF-IDF transformed training data
svm_model.fit(X_train_tfidf, y_train)

# Predict sentiment on the testing data
y_pred_svm = svm_model.predict(X_test_tfidf)

# Evaluate the model
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f"SVM Accuracy: {accuracy_svm:.2f}")

# Display additional metrics
print("\nClassification Report:")
print(classification_report(y_test, y_pred_svm))


# random forest

In [16]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the Random Forest model
rf_model = RandomForestClassifier(n_estimators=10, random_state=42)

# Train the model on the TF-IDF transformed training data
rf_model.fit(X_train_tfidf, y_train)

# Predict sentiment on the testing data
y_pred_rf = rf_model.predict(X_test_tfidf)

# Evaluate the model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Accuracy: {accuracy_rf:.2f}")

# Display additional metrics
print("\nClassification Report:")
print(classification_report(y_test, y_pred_rf))

Random Forest Accuracy: 0.78

Classification Report:
              precision    recall  f1-score   support

          -1       0.78      0.78      0.78     19993
           1       0.78      0.78      0.78     20007

    accuracy                           0.78     40000
   macro avg       0.78      0.78      0.78     40000
weighted avg       0.78      0.78      0.78     40000



# create submission

In [24]:
def create_csv_submission(ids, y_pred, name):
    """
    This function creates a csv file named 'name' in the format required for a submission in Kaggle or AIcrowd.
    The file will contain two columns the first with 'ids' and the second with 'y_pred'.
    y_pred must be a list or np.array of 1 and -1 otherwise the function will raise a ValueError.

    Args:
        ids (list,np.array): indices
        y_pred (list,np.array): predictions on data correspondent to indices
        name (str): name of the file to be created
    """
    # Check that y_pred only contains -1 and 1
    if not all(i in [-1, 1] for i in y_pred):
        raise ValueError("y_pred can only contain values -1, 1")

    with open(name, "w", newline="") as csvfile:
        fieldnames = ["Id", "Prediction"]
        writer = csv.DictWriter(csvfile, delimiter=",", fieldnames=fieldnames)
        writer.writeheader()
        for r1, r2 in zip(ids, y_pred):
            writer.writerow({"Id": int(r1), "Prediction": int(r2)})

ids=np.arange(1,len(y_pred)+1)
y_subm=logreg_model.predict(X_pred)
create_csv_submission(ids, y_subm, "submission.csv")