In [4]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import feature_extraction, linear_model, model_selection, preprocessing

In [5]:
from nltk.corpus import stopwords
import nltk.corpus
nltk.download('stopwords')

stop = stopwords.words('english')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Saima\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
import re


def clean_punctuations(marks):
    return re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", marks)


def remove_stopwords(text):
   text = " ".join([word for word in text.split() if word not in (stop)])
   return text


def convert_to_lowercase(string):
    return string.lower()


In [9]:
train_df = pd.read_csv("./train.csv")
test_df = pd.read_csv("./test.csv")

In [11]:
train_df['text'] = train_df['text'].apply(remove_stopwords)
train_df['text'] = train_df['text'].apply(convert_to_lowercase)
train_df['text'] = train_df['text'].apply(clean_punctuations)

test_df['text'] = test_df['text'].apply(remove_stopwords)
test_df['text'] = test_df['text'].apply(convert_to_lowercase)
test_df['text'] = test_df['text'].apply(clean_punctuations)


In [13]:
count_vectorizer = feature_extraction.text.CountVectorizer()

## let's get counts of unique words for the first 5 tweets in the data
example_train_vectors = count_vectorizer.fit_transform(train_df["text"][0:5])

In [14]:
## we use .todense() here because these vectors are "sparse" (only non-zero elements are kept to save space)
print(example_train_vectors[1].todense().shape)
print(example_train_vectors[1].todense())

# The above tells us that:

# There are 54 unique words (or "tokens") in the first five tweets.
# The first tweet contains only some of those unique tokens - all of the non-zero counts above are the tokens that DO exist in the first tweet.

(1, 36)
[[0 0 0 0 0 1 0 0 0 0 1 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0]]


In [15]:
train_vectors = count_vectorizer.fit_transform(train_df["text"])

## note that we're NOT using .fit_transform() here. Using just .transform() makes sure
# that the tokens in the train vectors are the only ones mapped to the test vectors - 
# i.e. that the train and test vectors use the same set of tokens.
test_vectors = count_vectorizer.transform(test_df["text"])

In [16]:
count_vectorizer.vocabulary_

{'deeds': 4482,
 'reason': 12800,
 'earthquake': 5254,
 'may': 9948,
 'allah': 1144,
 'forgive': 6371,
 'us': 16489,
 'forest': 6361,
 'fire': 6164,
 'near': 10780,
 'la': 9023,
 'ronge': 13355,
 'sask': 13643,
 'canada': 2944,
 'residents': 13071,
 'asked': 1593,
 'shelter': 14038,
 'place': 11981,
 'notified': 11050,
 'officers': 11214,
 'evacuation': 5692,
 'orders': 11407,
 'expected': 5788,
 '13000': 140,
 'people': 11779,
 'receive': 12820,
 'wildfires': 17113,
 'california': 2897,
 'got': 6922,
 'sent': 13894,
 'photo': 11877,
 'ruby': 13448,
 'alaska': 1093,
 'smoke': 14388,
 'pours': 12174,
 'school': 13723,
 'rockyfire': 13310,
 'update': 16445,
 'hwy': 7803,
 '20': 258,
 'closed': 3521,
 'directions': 4789,
 'due': 5175,
 'lake': 9046,
 'county': 3973,
 'cafire': 2867,
 'flood': 6258,
 'disaster': 4811,
 'heavy': 7381,
 'rain': 12659,
 'causes': 3117,
 'flash': 6222,
 'flooding': 6260,
 'streets': 14952,
 'manitou': 9790,
 'colorado': 3656,
 'springs': 14702,
 'areas': 1473,

In [17]:
## Our vectors are really big, so we want to push our model's weights
## toward 0 without completely discounting different words - ridge regression 
## is a good way to do this.
clf = linear_model.RidgeClassifier()

In [18]:
scores = model_selection.cross_val_score(clf, train_vectors, train_df["target"], cv=3, scoring="f1")
scores

array([0.60544581, 0.51788413, 0.5928174 ])

In [19]:
clf.fit(train_vectors, train_df["target"])

RidgeClassifier()

In [20]:
sample_submission = pd.read_csv("./sample_submission.csv")

In [21]:
sample_submission["target"] = clf.predict(test_vectors)

In [22]:
sample_submission.head()

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,1
3,9,1
4,11,1


In [23]:
sample_submission.to_csv("submission.csv", index=False)

In [24]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.metrics import jaccard_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
# import xgboost as xgb

In [42]:

# clf = GradientBoostingClassifier() baje
# clf = KNeighborsClassifier(n_neighbors=5) baje
# clf = clf=svm.SVC(gamma="scale") motamuti
# clf = AdaBoostClassifier(n_estimators=100, random_state=0) baje
clf = DecisionTreeClassifier(random_state=0) # motamoti
# clf = MLPClassifier(random_state=1, max_iter=300)
X_train = train_vectors
y_train = train_df["target"]
X_test = test_vectors
clf.fit(X_train, y_train)

sample_submission = pd.read_csv("./sample_submission.csv")
sample_submission["target"] = clf.predict(X_test)

acc_train=format(accuracy_score(clf.predict(X_train), y_train),'.3f')
f1_train=format(f1_score(y_train,clf.predict(X_train), average='macro'),'.3f')



In [43]:
print(acc_train)
print(f1_train)

0.988
0.987


In [44]:
scores = model_selection.cross_val_score(clf, train_vectors, train_df["target"], cv=5, scoring="f1")
scores

array([0.54240447, 0.48878577, 0.54257724, 0.43760984, 0.61014493])

In [None]:
sample_submission.to_csv("submission.csv", index=False)