In [1]:
import gensim.downloader as api

# Name of the pretrained embedding set requested from gensim's downloader.
# `wv` is the keyed-vector lookup that later cells use to embed tokens.
WORD2VEC_MODEL_NAME = 'word2vec-google-news-300'
wv = api.load(WORD2VEC_MODEL_NAME)

In [2]:
import pandas as pd

# Read the training set and replace every missing value with the literal
# string 'NA' in one chained expression instead of mutating the frame in place.
df = pd.read_csv('train.csv').fillna('NA')

In [3]:
# How many rows carry each label value.
df['target'].value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [59]:
import numpy as np
import spacy
nlp = spacy.load('en_core_web_sm')

def preprocess_and_vectorize(text):
    """Convert raw text into one averaged word-embedding vector.

    The text is tokenized with the spaCy pipeline `nlp`. Punctuation and
    stop-word tokens are dropped, URL-like tokens are replaced by the
    placeholder 'URL', and every other token is reduced to its lower-cased
    lemma. The surviving tokens are averaged with `wv.get_mean_vector`.

    Args:
        text: The string to embed.

    Returns:
        A 1-D numpy array of length `wv.vector_size`. If no token survives
        filtering (for example empty or stop-word-only text) a zero vector
        is returned instead of raising.
    """
    doc = nlp(text)

    filtered_tokens = [
        'URL' if token.like_url else token.lemma_.lower()
        for token in doc
        if not (token.is_punct or token.is_stop)
    ]

    # get_mean_vector raises ValueError when given an empty key list, so a
    # single empty / stop-word-only row would abort the whole DataFrame apply.
    # Fall back to an all-zero vector of the embedding's size and dtype.
    if not filtered_tokens:
        return np.zeros(wv.vector_size, dtype=wv.vectors.dtype)

    return wv.get_mean_vector(filtered_tokens)

In [60]:
def _row_to_vector(row):
    """Embed one row using its text, location and keyword joined by spaces."""
    return preprocess_and_vectorize(f"{row['text']} {row['location']} {row['keyword']}")


# One embedding per row, stored as an array-valued column.
df['vector'] = df.apply(_row_to_vector, axis=1)

In [6]:
# Peek at the first five rows, including the new `vector` column.
df.head(n=5)

Unnamed: 0,id,keyword,location,text,target,vector
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,"[0.054886103, 0.0060629225, 0.05688734, 0.0300..."
1,4,,,Forest fire near La Ronge Sask. Canada,1,"[0.03859707, -0.00494637, 0.009483899, 0.04513..."
2,5,,,All residents asked to 'shelter in place' are ...,1,"[-0.0048536863, 0.011481234, 0.016771162, -0.0..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"[0.059389483, -0.020141622, -0.003634977, 0.05..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,"[0.013593033, 0.0049750614, -0.024108628, 0.04..."


In [61]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
features = df['vector'].to_numpy()
labels = df['target']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

In [62]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Each split holds one embedding array per row; stack them into 2-D matrices
# (rows = samples) so scikit-learn estimators can consume them.
X_train_2d, X_test_2d = (np.stack(split) for split in (X_train, X_test))

# Learn the per-feature min/max on the training matrix only, then apply the
# same scaling to both splits so no test-set statistics leak into training.
scaler = MinMaxScaler()
scaler.fit(X_train_2d)
X_train_2d_norm = scaler.transform(X_train_2d)
X_test_2d_norm = scaler.transform(X_test_2d)

In [63]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Random forests are stochastic (bootstrap sampling, random feature subsets).
# Pin the seed, matching the one used for the train/test split, so the report
# below is reproducible on a fresh kernel.
clf = RandomForestClassifier(random_state=42)

clf.fit(X_train_2d_norm, y_train)

y_pred = clf.predict(X_test_2d_norm)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.90      0.84       874
           1       0.83      0.65      0.73       649

    accuracy                           0.80      1523
   macro avg       0.81      0.78      0.78      1523
weighted avg       0.80      0.80      0.79      1523



In [65]:
from sklearn.naive_bayes import MultinomialNB

# Baseline: multinomial naive Bayes on the min-max scaled embeddings.
# NOTE(review): this estimator is usually applied to count-like features;
# confirm it is the intended baseline for dense embedding inputs.
clf = MultinomialNB().fit(X_train_2d_norm, y_train)
y_pred = clf.predict(X_test_2d_norm)

nb_report = classification_report(y_test, y_pred)
print(nb_report)

              precision    recall  f1-score   support

           0       0.68      0.95      0.79       874
           1       0.86      0.40      0.54       649

    accuracy                           0.72      1523
   macro avg       0.77      0.67      0.67      1523
weighted avg       0.76      0.72      0.69      1523



In [66]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

# k-nearest-neighbours with scikit-learn's default settings.
# NOTE(review): the recorded output for this cell is
# "AttributeError: 'NoneType' object has no attribute 'split'". That is
# presumably an environment problem (e.g. an outdated threadpoolctl) rather
# than a bug in this code -- confirm and re-run.
clf = KNeighborsClassifier().fit(X_train_2d_norm, y_train)
y_pred = clf.predict(X_test_2d_norm)

knn_report = classification_report(y_test, y_pred)
print(knn_report)

AttributeError: 'NoneType' object has no attribute 'split'

In [68]:
from tensorflow import keras

# Weight initialisation and batch shuffling are random; seed Python, NumPy
# and the backend in one call so the training run below is reproducible.
keras.utils.set_random_seed(42)

# Small feed-forward binary classifier: two ReLU hidden layers followed by a
# single sigmoid unit that outputs a score in [0, 1].
model = keras.Sequential([
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(100, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid'),
])

# Binary cross-entropy pairs with the sigmoid output; accuracy is tracked
# for the per-epoch log.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.fit(X_train_2d_norm, y_train, epochs=50, batch_size=32)

# Threshold the sigmoid scores at 0.5 to obtain hard 0/1 labels.
y_pred = model.predict(X_test_2d_norm)
y_pred_binary = (y_pred > 0.5).astype(int)

print(classification_report(y_test, y_pred_binary))


Epoch 1/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 589us/step - accuracy: 0.5974 - loss: 0.6603
Epoch 2/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 558us/step - accuracy: 0.7224 - loss: 0.5579
Epoch 3/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 645us/step - accuracy: 0.7466 - loss: 0.5320
Epoch 4/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 546us/step - accuracy: 0.7533 - loss: 0.5077
Epoch 5/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 541us/step - accuracy: 0.7559 - loss: 0.5107
Epoch 6/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 592us/step - accuracy: 0.7572 - loss: 0.5168
Epoch 7/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 569us/step - accuracy: 0.7761 - loss: 0.4803
Epoch 8/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 527us/step - accuracy: 0.7632 - loss: 0.5006
Epoch 9/50
[1m191/191[