In [1]:
import pandas as pd


In [2]:
train_df = pd.read_csv("datasets/train.csv")
test_df = pd.read_csv("datasets/test.csv")

In [3]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


## Preprocessing Training data

In [5]:
import re
import nltk
from nltk.corpus import stopwords
from bs4 import BeautifulSoup

In [6]:
## 1. Lower All the Cases
train_df['text'] =train_df['text'].str.lower()

## Removing Special Characters
train_df['text'] = train_df['text'].apply(lambda x:re.sub('[^a-z A-Z 0-9-]+','',x))

## Removing the stopwords
# train_df['text'] = train_df['text'].apply(lambda x:" ".join([y for y in x.split() if y not in stopwords.words('english')]))
stop_words = set(stopwords.words('english'))
train_df['text'] = train_df['text'].apply(lambda x: " ".join([word for word in x.split() if word not in stop_words]))

## Removing url
train_df['text'] = train_df['text'].apply(lambda x: re.sub(r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?','', str(x)))

## Removing  html tags
train_df['text'] = train_df['text'].apply(lambda x: BeautifulSoup(x, 'lxml').get_text())

## Removing any additional spaces
train_df['text'] = train_df['text'].apply(lambda x: " ".join(x.split()))

In [None]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,deeds reason earthquake may allah forgive us,1
1,4,,,forest fire near la ronge sask canada,1
2,5,,,residents asked shelter place notified officer...,1
3,6,,,13000 people receive wildfires evacuation orde...,1
4,7,,,got sent photo ruby alaska smoke wildfires pou...,1


In [8]:
## Lemmatizer 
from  nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [9]:
def lemmatize_words(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

train_df['text'] = train_df['text'].apply(lambda x:lemmatize_words(x))

In [10]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,deed reason earthquake may allah forgive u,1
1,4,,,forest fire near la ronge sask canada,1
2,5,,,resident asked shelter place notified officer ...,1
3,6,,,13000 people receive wildfire evacuation order...,1
4,7,,,got sent photo ruby alaska smoke wildfire pour...,1


## Preprocessing of Test data

In [11]:
## 1. Lower All the Cases
test_df['text'] =test_df['text'].str.lower()

## Removing Special Characters
test_df['text'] = test_df['text'].apply(lambda x:re.sub('[^a-z A-Z 0-9-]+','',x))

## Removing the stopwords
# test_df['text'] = test_df['text'].apply(lambda x:" ".join([y for y in x.split() if y not in stopwords.words('english')]))
stop_words = set(stopwords.words('english'))
test_df['text'] = test_df['text'].apply(lambda x: " ".join([word for word in x.split() if word not in stop_words]))

## Removing url
test_df['text'] = test_df['text'].apply(lambda x: re.sub(r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?','', str(x)))

## Removing  html tags
test_df['text'] = test_df['text'].apply(lambda x: BeautifulSoup(x, 'lxml').get_text())

## Removing any additional spaces
test_df['text'] = test_df['text'].apply(lambda x: " ".join(x.split()))

In [12]:
def lemmatize_words(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

test_df['text'] = test_df['text'].apply(lambda x:lemmatize_words(x))

In [13]:
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,happened terrible car crash
1,2,,,heard earthquake different city stay safe ever...
2,3,,,forest fire spot pond goose fleeing across str...
3,9,,,apocalypse lighting spokane wildfire
4,11,,,typhoon soudelor kill 28 china taiwan


In [14]:
X_train = train_df["text"]
y_train = train_df["target"]
X_test = test_df["text"]

## Training Word to vec on train and test data

In [15]:
import gensim
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import numpy as np

In [16]:
# Tokenize the reviews
X_train_tokens = [word_tokenize(review.lower()) for review in X_train]
X_test_tokens = [word_tokenize(review.lower()) for review in X_test]

In [17]:
# Train Word2Vec on training tokens
word2vec_model = Word2Vec(sentences=X_train_tokens, vector_size=100, window=5, min_count=2, workers=4, sg=1)
word2vec_model = Word2Vec(sentences=X_test_tokens, vector_size=100, window=5, min_count=2, workers=4, sg=1)

In [18]:
def avg_word2vec(tokens, model, vector_size):
    """Compute the average Word2Vec for a list of tokens."""
    vectors = [model.wv[word] for word in tokens if word in model.wv.index_to_key]
    if len(vectors) > 0:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(vector_size)

In [19]:
# Convert reviews to average Word2Vec vectors
X_train_avg = np.array([avg_word2vec(tokens, word2vec_model, 100) for tokens in X_train_tokens])
X_test_avg = np.array([avg_word2vec(tokens, word2vec_model, 100) for tokens in X_test_tokens])

In [20]:
X_train_avg.shape

(7613, 100)

In [21]:
X_train_avg[0]

array([-1.02867678e-01,  1.60298988e-01,  1.63621791e-02,  1.16022630e-02,
       -2.38105953e-02, -2.61495262e-01,  7.22299051e-03,  2.21647695e-01,
       -9.18482691e-02, -1.05311774e-01, -1.13378070e-01, -2.15209752e-01,
        2.57335277e-03,  7.30944574e-02,  3.22373696e-02, -1.87296599e-01,
        9.85176861e-03, -2.20953703e-01,  6.77561909e-02, -2.66245186e-01,
        1.02455400e-01,  2.09868215e-02,  1.42470345e-01,  2.78335549e-02,
       -7.21919388e-02, -1.00523829e-02, -1.75295740e-01, -8.45400915e-02,
       -9.78691727e-02, -2.87717078e-02,  1.36251315e-01,  5.60353983e-05,
       -4.77746576e-02, -1.10933743e-01, -1.13100745e-01,  1.42477199e-01,
       -4.80682179e-02, -1.21816374e-01, -1.35119036e-02, -2.06172466e-01,
       -3.15023176e-02, -5.13692684e-02, -4.73002568e-02,  1.11297090e-02,
        5.97353205e-02,  5.30744120e-02, -2.15583235e-01, -2.26286538e-02,
        1.43696249e-01,  9.13152322e-02,  3.54549959e-02, -1.21163130e-01,
       -1.49871945e-01, -

In [22]:
y_train.shape

(7613,)

In [23]:
# from sklearn.linear_model import LogisticRegression
# # Initialize Logistic Regression model
# log_reg = LogisticRegression(max_iter=1000)

# # Train the model
# log_reg.fit(X_train_avg, y_train)

# # Predict on the test set
# y_pred_LR = log_reg.predict(X_test_avg)

In [24]:
# submission = pd.DataFrame({"id": test_df["id"], "target": y_pred_LR})
# submission.to_csv("submission.csv", index=False)

In [25]:
# from sklearn.svm import SVC

# svm = SVC(kernel='rbf', C=10, gamma='scale', probability=True)
# svm.fit(X_train_avg, y_train)
# y_pred_svm = svm.predict(X_test_avg)

# submission = pd.DataFrame({"id": test_df["id"], "target": y_pred_svm})
# submission.to_csv("submission.csv", index=False)


In [26]:
# from xgboost import XGBClassifier
# from sklearn.model_selection import RandomizedSearchCV

# param_grid = {
#     'n_estimators': [100, 200, 300],  # Number of boosting rounds
#     'max_depth': [3, 5, 7],  # Lower depth prevents overfitting
#     'learning_rate': [0.01, 0.05, 0.1],  # Controls step size
#     'subsample': [0.7, 0.8, 1],  # Prevents overfitting
#     'colsample_bytree': [0.7, 0.8, 1],  # Feature selection per tree
#     'gamma': [0, 0.1, 0.2],  # Regularization
#     'reg_lambda': [0, 1, 10]  # L2 Regularization
# }

# xgb = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric="logloss")
# random_search = RandomizedSearchCV(xgb, param_distributions=param_grid, n_iter=10, cv=5, scoring="accuracy", n_jobs=-1)
# random_search.fit(X_train_avg, y_train)

# best_xgb = random_search.best_estimator_
# y_pred_xgb = best_xgb.predict(X_test_avg)


In [27]:
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV

# Define parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of boosting rounds
    'max_depth': [3, 5, 7],  # Lower depth prevents overfitting
    'learning_rate': [0.01, 0.05, 0.1],  # Controls step size
    'subsample': [0.7, 0.8, 1],  # Prevents overfitting
    'colsample_bytree': [0.7, 0.8, 1],  # Feature selection per tree
    'gamma': [0, 0.1, 0.2],  # Regularization
    'reg_lambda': [0, 1, 10]  # L2 Regularization
}
xgb = XGBClassifier(random_state=42, eval_metric="logloss")

# Perform Randomized Search CV
random_search = RandomizedSearchCV(
    xgb, 
    param_distributions=param_grid, 
    n_iter=10, 
    cv=5, 
    scoring="accuracy", 
    n_jobs=-1
)

# Train model
random_search.fit(X_train_avg, y_train)

# Get best model
best_xgb = random_search.best_estimator_

# Make predictions
y_pred_xgb = best_xgb.predict(X_test_avg)


In [28]:
submission = pd.DataFrame({"id": test_df["id"], "target": y_pred_xgb})
submission.to_csv("submission.csv", index=False)