In [None]:

import pandas as pd
import numpy as np
import requests
from io import StringIO
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
import getpass


# --- DBRepo connection settings ---------------------------------------------
# Header sent with every table request: ask the API for CSV.
headers = {"Accept": "text/csv"}

# The account name is fixed here; the password is prompted for at run time so
# it is never written into the notebook.
username = "hachembouhamidi"
password = getpass.getpass("Enter your DBRepo password: ")

# One shared session that carries the (username, password) pair as its auth
# setting for every request made through it.
session = requests.Session()
session.auth = (username, password)


def download_table(database_id, table_id, timeout=60):
    """Fetch one DBRepo table as CSV and parse it into a DataFrame.

    Uses the module-level ``session`` (which carries the credentials) and
    ``headers`` (``Accept: text/csv``).

    Args:
        database_id: Identifier of the database that owns the table.
        table_id: Identifier of the table to download.
        timeout: Seconds to wait for the server, passed straight to
            ``session.get``. Without a timeout a stalled connection would
            block the notebook indefinitely.

    Returns:
        pandas.DataFrame parsed from the CSV response body.

    Raises:
        RuntimeError: If the server answers with any status other than 200.
        Exceptions raised by ``session.get`` itself (e.g. on timeout or
        connection failure) propagate unchanged.
    """
    url = f"https://test.dbrepo.tuwien.ac.at/api/database/{database_id}/table/{table_id}/data"
    response = session.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        raise RuntimeError(f"Failed to download table {table_id}: {response.status_code}")

    # When a text/* response declares no charset, requests falls back to
    # ISO-8859-1, which would garble non-ASCII characters in the tweet text.
    # Decode as UTF-8 in that case; a charset declared by the server is kept.
    # NOTE(review): assumes the API serves UTF-8 when it declares nothing -- confirm.
    if "charset" not in response.headers.get("Content-Type", "").lower():
        response.encoding = "utf-8"

    return pd.read_csv(StringIO(response.text))

# Identifiers of the DBRepo database and of its three data-split tables.
database_id = "147ae136-e292-4fa0-a3a8-61543690bbe4"
train_table_id = "de96a523-c9ee-45c6-a81c-57fde933bee7"
validation_table_id = "191f1592-f1a9-4d12-a985-77f60bf0e896"
test_table_id = "6673adbc-201e-450b-a697-3297010210c1"

# Download the splits in the order train -> validation -> test.
train_df = download_table(database_id=database_id, table_id=train_table_id)
val_df = download_table(database_id=database_id, table_id=validation_table_id)
test_df = download_table(database_id=database_id, table_id=test_table_id)

# Report (rows, columns) of each split as a quick sanity check.
for split_label, split_frame in (
    ("Training", train_df),
    ("Validation", val_df),
    ("Test", test_df),
):
    print(f"{split_label} data shape: {split_frame.shape}")

# Columns holding the raw text and the label to predict.
text_column = 'originaltweet'
target_column = 'sentiment_category'

print(f"\nUsing column '{text_column}' for text and '{target_column}' for sentiment")

# Label series for each split, taken straight from the downloaded frames.
y_train = train_df[target_column]
y_val = val_df[target_column]
y_test = test_df[target_column]

# Bag-of-words counts. The vocabulary (capped at 5000 terms) is fitted on the
# training split only and then reused unchanged for validation and test.
vectorizer = CountVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_df[text_column])
X_val, X_test = (
    vectorizer.transform(frame[text_column]) for frame in (val_df, test_df)
)

# The classifier used later is fed dense arrays, so materialise each matrix.
print("Converting sparse matrices to dense arrays...")
X_train_dense, X_val_dense, X_test_dense = (
    sparse_matrix.toarray() for sparse_matrix in (X_train, X_val, X_test)
)
print(f"Dense array shapes - Train: {X_train_dense.shape}, Val: {X_val_dense.shape}, Test: {X_test_dense.shape}")


# Fit the gradient-boosting classifier on the dense training features.
print("Training model...")
model = HistGradientBoostingClassifier(random_state=42)
model.fit(X_train_dense, y_train)

# Score the fitted model on the two held-out splits.
print("Evaluating model...")
val_predictions = model.predict(X_val_dense)
test_predictions = model.predict(X_test_dense)

val_accuracy = accuracy_score(y_true=y_val, y_pred=val_predictions)
print(f"Validation Accuracy: {val_accuracy:.4f}")

test_accuracy = accuracy_score(y_true=y_test, y_pred=test_predictions)
print(f"Test Accuracy: {test_accuracy:.4f}")

# Per-class precision / recall / F1 on the test split; kept in `report` so it
# can be written to disk later.
report = classification_report(y_true=y_test, y_pred=test_predictions)
print("\nClassification Report (Test Set):")
print(report)

# Plot the test-set confusion matrix and save it as a PNG.
print("Creating confusion matrix...")

# Sorted union of the true and predicted labels: the same label set and order
# confusion_matrix infers when `labels` is omitted, so `cm` is unchanged.
# Making it explicit lets the heatmap axes show the class names instead of
# bare indices 0..n-1.
class_labels = sorted(set(y_test) | set(test_predictions))
cm = confusion_matrix(y_test, test_predictions, labels=class_labels)

plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
# A tight bounding box keeps the longer class names from being clipped.
plt.savefig('confusion_matrix.png', bbox_inches='tight')
# Close the figure: it is only needed on disk, not as inline output.
plt.close()


# Persist everything needed to reuse or audit this run.
print("Saving outputs...")

# The fitted model and the vectorizer that produced its features are saved as
# a pair: predictions on new text need both.
for artifact, destination in (
    (model, 'sentiment_model.joblib'),
    (vectorizer, 'count_vectorizer.joblib'),
):
    joblib.dump(artifact, destination)

# Full test split with the model's prediction added as an extra column.
test_df['predicted_sentiment'] = test_predictions
test_df.to_csv('test_predictions_full.csv', index=False)

# Plain-text summary: both accuracies followed by the per-class report.
performance_summary = (
    f"Validation Accuracy: {val_accuracy:.4f}\n"
    f"Test Accuracy: {test_accuracy:.4f}\n\n"
    "Classification Report:\n"
) + report
with open('model_performance.txt', 'w') as performance_file:
    performance_file.write(performance_summary)

print("All outputs saved successfully!")

Enter your DBRepo password: ··········
Training data shape: (19347, 10)
Validation data shape: (4146, 10)
Test data shape: (4146, 10)

Using column 'originaltweet' for text and 'sentiment_category' for sentiment
Converting sparse matrices to dense arrays...
Dense array shapes - Train: (19347, 5000), Val: (4146, 5000), Test: (4146, 5000)
Training model...
Evaluating model...
Validation Accuracy: 0.5169
Test Accuracy: 0.5142

Classification Report (Test Set):
               precision    recall  f1-score   support

     Negative       0.48      0.42      0.45      1005
      Neutral       0.53      0.77      0.63      1661
     Positive       0.39      0.19      0.26       700
Very Negative       0.49      0.21      0.29       310
Very Positive       0.60      0.48      0.53       470

     accuracy                           0.51      4146
    macro avg       0.50      0.42      0.43      4146
 weighted avg       0.50      0.51      0.49      4146

Creating confusion matrix...
Saving outp

In [None]:
from google.colab import files
files.download('sentiment_model.joblib')
files.download('test_predictions_full.csv')
files.download('model_performance.txt')
files.download('confusion_matrix.png')
files.download('count_vectorizer.joblib')
!zip outputs.zip sentiment_model.joblib test_predictions_full.csv model_performance.txt confusion_matrix.png count_vectorizer.joblib
files.download('outputs.zip')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

  adding: sentiment_model.joblib (deflated 68%)
  adding: test_predictions_full.csv (deflated 73%)
  adding: model_performance.txt (deflated 60%)
  adding: confusion_matrix.png (deflated 14%)
  adding: count_vectorizer.joblib (deflated 71%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>