In [None]:
import pandas as pd
import re

df = pd.read_csv('Book3.csv', encoding='latin-1', nrows=50000, error_bad_lines=False)
df.columns = ["sentiment", "id", "date", "flag", "user", "tweet_text"]

# Convert sentiment 4 to 1
df['sentiment'] = df['sentiment'].replace(4, 1)

# Preprocessing
df = df.dropna(subset=['tweet_text', 'sentiment'])  # Drop rows with missing values

# Function to clean tweets
def clean_text(text):
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)  # remove urls
    text = re.sub(r'\@\w+|\#', '', text)  # remove at and hashtag
    text = text.encode('ascii', 'ignore').decode('ascii')  # remove non-ascii characters
    return text

df['tweet_text'] = df['tweet_text'].apply(clean_text)

# Print the first few rows of the dataframe
print(df.head())



  df = pd.read_csv('Book3.csv', encoding='latin-1', nrows=50000, error_bad_lines=False)


   sentiment          id                          date      flag  \
0          0  1548274671  Fri Apr 17 20:30:31 PDT 2009  NO_QUERY   
1          0  1548274782  Fri Apr 17 20:30:34 PDT 2009  NO_QUERY   
2          0  1548275152  Fri Apr 17 20:30:38 PDT 2009  NO_QUERY   
3          0  1548275569  Fri Apr 17 20:30:39 PDT 2009  NO_QUERY   
4          0  1548275799  Fri Apr 17 20:30:43 PDT 2009  NO_QUERY   

              user                                         tweet_text  
0     xoLovebug224  Working on my songg for aunt nan.   kinda hard...  
1      Kerry_Baker  can't sleep  it's 4.30am and i have to be up a...  
2  glamorusindie81  wishing i could be at coachella this weekend  ...  
3            WOnet  Well   was having a tough day/night. Wanted  t...  
4  jessicakornberg  taking some much needed naked time.  too bad i...  


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Preprocessing
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['tweet_text'])  
y = df['sentiment'] 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Modele
models = {
    "Naive Bayes": MultinomialNB(),
    "SVM": SVC(probability=True),
    "Random Forest": RandomForestClassifier()
}

# Trening i ewaluacja
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"Model: {name}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("Accuracy Score:", accuracy_score(y_test, y_pred))
    print("ROC-AUC Score:", roc_auc_score(y_test, y_pred))
    print()

Model: Naive Bayes
Confusion Matrix:
[[2776 1321]
 [1069 4834]]
Accuracy Score: 0.761
ROC-AUC Score: 0.7482372970458753

Model: SVM
Confusion Matrix:
[[2473 1624]
 [ 701 5202]]
Accuracy Score: 0.7675
ROC-AUC Score: 0.7424296114827826

Model: Random Forest
Confusion Matrix:
[[2705 1392]
 [1154 4749]]
Accuracy Score: 0.7454
ROC-AUC Score: 0.7323726913554173



In [None]:
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Bidirectional
from keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score

# Preprocessing
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['tweet_text'])
sequences = tokenizer.texts_to_sequences(df['tweet_text'])
data = pad_sequences(sequences, maxlen=100)  # replace 100 with the length you want

X_train, X_val, y_train, y_val = train_test_split(data, y, test_size=0.2, random_state=42)

# Model
model = Sequential()
model.add(Embedding(10000, 128))
model.add(Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2)))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

checkpoint = ModelCheckpoint('best_model.h5', save_best_only=True, monitor='val_loss', mode='min')
early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)

# Fiting
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, callbacks=[checkpoint, early_stop])

# Wczytanie najlepszego modelu
model.load_weights('best_model.h5')

y_pred = (model.predict(X_test) > 0.5).astype('int32')

# Ewaluacja
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("ROC-AUC Score:", roc_auc_score(y_test, y_pred))



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 6: early stopping
Confusion Matrix:
[[   0 4097]
 [   0 5903]]
Accuracy Score: 0.5903
ROC-AUC Score: 0.5


In [None]:
from keras.layers import GRU

# Model
model_gru = Sequential()
model_gru.add(Embedding(10000, 128))
model_gru.add(GRU(128, dropout=0.2, recurrent_dropout=0.2))
model_gru.add(Dense(1, activation='sigmoid'))

model_gru.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

checkpoint_gru = ModelCheckpoint('best_model_gru.h5', save_best_only=True, monitor='val_loss', mode='min')

# Fiting
model_gru.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, callbacks=[checkpoint_gru])

# Wczytanie najlepszego modelu
model_gru.load_weights('best_model_gru.h5')

y_pred_gru = (model_gru.predict(X_test) > 0.5).astype('int32')

# Ewaluacja
print("Confusion Matrix for GRU:")
print(confusion_matrix(y_test, y_pred_gru))
print("Accuracy Score for GRU:", accuracy_score(y_test, y_pred_gru))
print("ROC-AUC Score for GRU:", roc_auc_score(y_test, y_pred_gru))


In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m58.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m82.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2


In [None]:
from transformers import BertTokenizer, TFBertForSequenceClassification, create_optimizer
from sklearn.model_selection import train_test_split

# Preprocessing
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
inputs = tokenizer(df['tweet_text'].tolist(), return_tensors='tf', padding=True, truncation=True, max_length=512)

input_ids, X_test, y_train, y_test = train_test_split(inputs['input_ids'], y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(input_ids, y_train, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2

# Zaczytanie Bert
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')

# Fine-tuning
optimizer = create_optimizer(init_lr=1e-5, num_train_steps=5000, num_warmup_steps=500)
model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=['accuracy'])

checkpoint = ModelCheckpoint('best_bert_model.h5', save_best_only=True, monitor='val_loss', mode='min')

# Fiting
model.fit([X_train, y_train], validation_data=([X_val, y_val]), epochs=3, batch_size=8, callbacks=[checkpoint])

model.load_weights('best_bert_model.h5')

y_pred = model.predict([X_test]).logits.argmax(axis=-1)

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("ROC-AUC Score:", roc_auc_score(y_test, y_pred))

In [None]:
from transformers import RobertaTokenizer, TFRobertaForSequenceClassification

# Preprocessing
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
inputs = tokenizer(df['tweet_text'].tolist(), return_tensors='tf', padding=True, truncation=True, max_length=512)

input_ids, X_test, y_train, y_test = train_test_split(inputs['input_ids'], y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(input_ids, y_train, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2

# Zaczytanie RoBERTa
model_roberta = TFRobertaForSequenceClassification.from_pretrained('roberta-base')

# Fine-tuning
optimizer = create_optimizer(init_lr=1e-5, num_train_steps=5000, num_warmup_steps=500)
model_roberta.compile(optimizer=optimizer, loss=model_roberta.compute_loss, metrics=['accuracy'])

checkpoint = ModelCheckpoint('best_roberta_model.h5', save_best_only=True, monitor='val_loss', mode='min')

# Fiting
model_roberta.fit([X_train, y_train], validation_data=([X_val, y_val]), epochs=3, batch_size=8, callbacks=[checkpoint])

# Wczytanie najlepszego modelu
model_roberta.load_weights('best_roberta_model.h5')

y_pred = model_roberta.predict([X_test]).logits.argmax(axis=-1)

# Ewaluacja
print("Confusion Matrix for RoBERTa:")
print(confusion_matrix(y_test, y_pred))
print("Accuracy Score for RoBERTa:", accuracy_score(y_test, y_pred))
print("ROC-AUC Score for RoBERTa:", roc_auc_score(y_test, y_pred))
