In [None]:
import numpy  as np
import pandas as pd

import matplotlib.pyplot as plt

# Analisi e pulizia dati

In [None]:
df_train = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
df_test  = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

df_full = pd.concat([df_train, df_test])

In [None]:
df_full['clean_text'] = df_full.text.replace(regex='(@\w+)|#|&|!', value='')

In [None]:
!pip install pyspellchecker

In [None]:
from tqdm import tqdm

tqdm.pandas()

In [None]:
from spellchecker import SpellChecker

spell = SpellChecker()

def correct_spellings(text):
    
    corrected_text = []
    misspelled_words = spell.unknown(text.split())
    
    for word in text.split():
        if word in misspelled_words:
            corrected_text.append(spell.correction(word))
        else:
            corrected_text.append(word)
    
    return " ".join([i for i in corrected_text if i != None])
        

In [None]:
df_full['clean_text'] = df_full.clean_text.progress_apply(lambda x: correct_spellings(x))

# Word2Vec


In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
stop=set(stopwords.words('english'))

In [None]:
def create_corpus(df):
    corpus=[]
    for tweet in tqdm(df['clean_text']):
        words=[word.lower() for word in word_tokenize(tweet) if((word.isalpha()==1) & (word not in stop))]
        corpus.append(words)
    return corpus

corpus = create_corpus(df_full)

In [148]:
from gensim.models import Word2Vec

w2v = Word2Vec(sentences=corpus, vector_size=50)

In [149]:
def average_word_vectors(sentence, word_embeddings):
    vectors = [word_embeddings[word] for word in sentence if word in word_embeddings]
    if not vectors:
        return np.zeros(word_embeddings.vector_size)
    return np.mean(vectors, axis=0)


train_data = np.array([average_word_vectors(i, w2v.wv) for i in corpus[:len(df_train)]])
test_data = np.array([average_word_vectors(i, w2v.wv) for i in corpus[len(df_train):]])
labels = df_train.target

In [150]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(train_data, labels, test_size=0.2, random_state=42)

model = LogisticRegression()
model.fit(X_train, y_train)

predictions = model.predict(X_test)

accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")

Accuracy: 0.6730137885751806


In [154]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

base_models = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('svm', SVC(kernel='linear')),
]

stacked_model = StackingClassifier(estimators=base_models, final_estimator=LogisticRegression())
stacked_model.fit(X_train, y_train)

predictions = stacked_model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")

Accuracy: 0.7255416940249507


In [151]:
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

predictions = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")

Accuracy: 0.6625082074852265


In [152]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

predictions = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")

Accuracy: 0.7301378857518056


# Prova con LSTM

In [155]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [199]:
tweets = df_full.clean_text.values

In [212]:
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")  
tokenizer.fit_on_texts(tweets)

In [213]:
sequences = tokenizer.texts_to_sequences(tweets[:len(df_train)])

In [214]:
max_sequence_length = 17
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='post', truncating='post')

In [216]:
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, df_train.target, test_size=0.2, random_state=42)

In [223]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

embedding_dim = 50  

model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=embedding_dim, input_length=max_sequence_length))
model.add(LSTM(64))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2)

accuracy = model.evaluate(X_test, y_test)[1]
print(f"Test Accuracy: {accuracy}")

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test Accuracy: 0.7327643036842346
