In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from  sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import LogisticRegression
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

In [2]:
df = pd.read_csv('train.csv')



print(df.shape)
print(df.head())

(7613, 5)
   id keyword location                                               text  \
0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   
2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   
3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   
4   7     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...   

   target  
0       1  
1       1  
2       1  
3       1  
4       1  


In [3]:
nlp = spacy.load("en_core_web_sm")
def preprocess(text):
    doc = nlp(text)
    no_stop_words = [token.text for token in doc if not token.is_stop]
    return " ".join(no_stop_words)

df.text = df.text.apply(preprocess)
df = df.fillna("Unknown")

In [10]:
X = df['text']
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [11]:
pipeline = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=(1,3))),
    ('random_forest', (RandomForestClassifier(n_estimators=50, criterion='entropy')))
])

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.72      0.95      0.82      1322
           1       0.89      0.49      0.63       962

    accuracy                           0.76      2284
   macro avg       0.80      0.72      0.73      2284
weighted avg       0.79      0.76      0.74      2284



In [12]:
#1. create a pipeline object

pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('lr', (LogisticRegression()))
])


#2. fit with X_train and y_train

pipeline.fit(X_train, y_train)

#3. get the predictions for X_test and store it in y_pred

y_pred = pipeline.predict(X_test)

#4. print the classfication report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.86      0.83      1322
           1       0.78      0.69      0.74       962

    accuracy                           0.79      2284
   macro avg       0.79      0.78      0.78      2284
weighted avg       0.79      0.79      0.79      2284



In [13]:
#1. create a pipeline object

pipeline = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=(1,3))),
    ('multi_nb', (MultinomialNB()))
])


#2. fit with X_train and y_train

pipeline.fit(X_train, y_train)

#3. get the predictions for X_test and store it in y_pred

y_pred = pipeline.predict(X_test)

#4. print the classfication report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.81      0.81      1322
           1       0.74      0.72      0.73       962

    accuracy                           0.78      2284
   macro avg       0.77      0.77      0.77      2284
weighted avg       0.77      0.78      0.78      2284



In [80]:
df_test = pd.read_csv('test.csv')
df_test.text = df_test.text.apply(preprocess)
df_test = df_test.fillna("Unkown")
y_pred = pipeline.predict(df_test.text)
d = {'id': df_test.id, 'target': y_pred}
df_out = pd.DataFrame(data=d)
df_out.to_csv('submission.csv', index=False)

!kaggle competitions submit -c nlp-getting-started -f submission.csv -m "Bag of words attempt - RandomForest"
!kaggle competitions submissions -c nlp-getting-started

Successfully submitted to Natural Language Processing with Disaster Tweets



  0%|          | 0.00/25.4k [00:00<?, ?B/s]
 63%|██████▎   | 16.0k/25.4k [00:00<00:00, 66.3kB/s]
100%|██████████| 25.4k/25.4k [00:00<00:00, 50.1kB/s]


fileName        date                 description                          status    publicScore  privateScore  
--------------  -------------------  -----------------------------------  --------  -----------  ------------  
submission.csv  2024-08-09 20:37:42  Bag of words attempt - RandomForest  complete  0.79773                    
submission.csv  2024-08-09 17:49:33  Bag of words attempt - RandomForest  complete  0.78731                    
submission.csv  2024-08-09 17:47:46  Bag of words attempt - RandomForest  complete  0.78731                    
