# Importing the libraries

In [1]:
import numpy as np
import pandas as pd
import sklearn as skl
from sklearn import feature_extraction, linear_model, model_selection, preprocessing


# Importing the dataset

In [2]:
# Load training and testing datasets
train_df = pd.read_csv("dataset/train.csv")
test_df = pd.read_csv("dataset/test.csv")


In [3]:
# Let's take a look at the training dataset
train_df[train_df["target"] == 0]["text"].head()

15                  What's up man?
16                   I love fruits
17                Summer is lovely
18               My car is so fast
19    What a goooooooaaaaaal!!!!!!
Name: text, dtype: object

In [4]:
# Train dataset is targeted, 0 or 1, where 0 means 'not a disaster tweet', so we can use it to train our model

In [5]:
train_df[train_df["target"] == 1]["text"].head()

0    Our Deeds are the Reason of this #earthquake M...
1               Forest fire near La Ronge Sask. Canada
2    All residents asked to 'shelter in place' are ...
3    13,000 people receive #wildfires evacuation or...
4    Just got sent this photo from Ruby #Alaska as ...
Name: text, dtype: object

In [6]:
# let's get counts for the first 5 tweets in the data
count_vectorizer = feature_extraction.text.CountVectorizer(stop_words="english", min_df=1)

## we use .todense() here because these vectors are "sparse" (only non-zero elements are kept to save space)
example_train_vectors = count_vectorizer.fit_transform(train_df["text"][0:5])
example_df = pd.DataFrame(data=example_train_vectors.todense(),columns = count_vectorizer.get_feature_names_out())
print(example_df)
print(example_train_vectors.shape)
print(example_train_vectors.todense())


   000  13  alaska  allah  asked  california  canada  deeds  earthquake  \
0    0   0       0      1      0           0       0      1           1   
1    0   0       0      0      0           0       1      0           0   
2    0   0       0      0      1           0       0      0           0   
3    1   1       0      0      0           1       0      0           0   
4    0   0       1      0      0           0       0      0           0   

   evacuation  ...  receive  residents  ronge  ruby  sask  school  sent  \
0           0  ...        0          0      0     0     0       0     0   
1           0  ...        0          0      1     0     1       0     0   
2           1  ...        0          1      0     0     0       0     0   
3           1  ...        1          0      0     0     0       0     0   
4           0  ...        0          0      0     1     0       1     1   

   shelter  smoke  wildfires  
0        0      0          0  
1        0      0          0  
2    

In [59]:
count_vectorizer = feature_extraction.text.CountVectorizer( min_df=0.001, stop_words="english")
train_vectors = count_vectorizer.fit_transform(train_df["text"])

## note that we're NOT using .fit_transform() here. Using just .transform() makes sure
# that the tokens in the train vectors are the only ones mapped to the test vectors - 
# i.e. that the train and test vectors use the same set of tokens.
test_vectors = count_vectorizer.transform(test_df["text"])
df = pd.DataFrame(data=train_vectors.todense(),columns = count_vectorizer.get_feature_names_out())
print(train_vectors.shape)
print(df)
print(train_vectors.todense())

(7613, 1647)
      00  01  04  05  06  07  08  10  100  11  ...  zone  û_  ûª  ûªs  ûªt  \
0      0   0   0   0   0   0   0   0    0   0  ...     0   0   0    0    0   
1      0   0   0   0   0   0   0   0    0   0  ...     0   0   0    0    0   
2      0   0   0   0   0   0   0   0    0   0  ...     0   0   0    0    0   
3      0   0   0   0   0   0   0   0    0   0  ...     0   0   0    0    0   
4      0   0   0   0   0   0   0   0    0   0  ...     0   0   0    0    0   
...   ..  ..  ..  ..  ..  ..  ..  ..  ...  ..  ...   ...  ..  ..  ...  ...   
7608   0   0   0   0   0   0   0   0    0   0  ...     0   0   0    0    0   
7609   0   0   0   0   0   0   0   0    0   0  ...     0   0   0    0    0   
7610   0   1   1   0   0   0   0   0    0   0  ...     0   0   0    0    0   
7611   0   0   0   0   0   0   0   0    0   0  ...     0   0   0    0    0   
7612   0   0   0   0   0   0   0   0    0   0  ...     0   0   0    0    0   

      ûªve  ûï  ûïwhen  ûò  ûó  
0        0   0   

# Our model

As we mentioned above, we think the words contained in each tweet are a good indicator of whether they're about a real disaster or not. The presence of particular word (or set of words) in a tweet might link directly to whether or not that tweet is real.

What we're assuming here is a linear connection. So let's build a linear model and see!

In [41]:
## Our vectors are really big, so we want to push our model's weights
## toward 0 without completely discounting different words - ridge regression 
## is a good way to do this.
clf = linear_model.RidgeClassifier()

In [42]:
scores = model_selection.cross_val_score(clf, train_vectors, train_df["target"], cv=10, scoring="f1")
scores

array([0.48275862, 0.44736842, 0.48518519, 0.54255319, 0.43911439,
       0.49285714, 0.52075472, 0.43220339, 0.65232975, 0.7299509 ])

# Cross Validation

In [80]:
# First, define the models we want to use
# We use the same random state for each model to be able to compare them
# We use the class_weight parameter to balance the classes
# We use the max_iter parameter to avoid convergence warnings
# We use the n_estimators parameter to avoid convergence warnings
# We use the max_depth parameter to avoid convergence warnings
# We use the l1_ratio parameter to avoid convergence warnings
# We use the eval_metric to logloss for classification

# Download the libraries
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB

# Linear regression, k nearest neighbors, random forest, xgboost, lightgbm, naive bayes, support vector machine
model_LR = skl.linear_model.LogisticRegression(C=0.05, l1_ratio=None, max_iter=10000)
model_KNN = skl.neighbors.KNeighborsClassifier()
model_RF = skl.ensemble.RandomForestClassifier(n_estimators=100, max_depth=2)
model_XGB = XGBClassifier(n_estimators=100, max_depth=2, use_label_encoder=False, eval_metric='logloss')
model_LGBM = LGBMClassifier(n_estimators=100, max_depth=2)
model_SVM = svm.SVC()
model_NBC = skl.naive_bayes.GaussianNB()

# put all those models in a list
models = [model_NBC]

In [82]:
# Download the libraries for the cross validation
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report
import datetime
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# define a function to perform cross validation for the given models
def cross_validation(model, the_df, the_target, n_splits=10, random_state=71, stratified=True, shuffle=True):
    """
    This function performs cross validation for the given models
    input:
        models: the models to be evaluated
        X: the features
        y: the target
        n_splits: the number of splits
        random_state: the random state
    output:
        a dataframe with the results
    """
    # create a dataframe to store the results
    results = pd.DataFrame(columns=['model', 'accuracy', 'precision', 'recall', 'f1'])
    # Create conditional statement to check if stratified or not
    if stratified:
        # create a stratified k-fold object
        if shuffle:
            kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
        else:
            kf = StratifiedKFold(n_splits=n_splits, shuffle=False)
    else:
        # create a non stratified k-fold object
        if shuffle:
            kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
        else:
            kf = KFold(n_splits=n_splits, shuffle=False) 
    # Define target and features
    X = the_df
    y = the_target
    # loop over the folds
    for train_index, test_index in kf.split(X, y):
        # split the data
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        # loop over the models
        for model in models:
            # fit the model
            time_0 = datetime.datetime.now()
            model.fit(X_train, np.ravel(y_train))
            time_1 = datetime.datetime.now()
            # predict the target
            y_pred = model.predict(X_test)
            # compute the accuracy
            accuracy = accuracy_score(y_test, y_pred)
            # compute the precision
            precision = classification_report(y_test, y_pred, output_dict=True)['1']['precision']
            # compute the recall
            recall = classification_report(y_test, y_pred, output_dict=True)['1']['recall']
            # compute the f1 score
            f1 = classification_report(y_test, y_pred, output_dict=True)['1']['f1-score']
            # compute the time to fit the model in seconds
            time_fit = (time_1 - time_0).total_seconds()

            # store the results in the dataframe sorted by accuracy
            results = results.append({'model': model.__class__.__name__,
                                      'accuracy': accuracy,
                                      'precision': precision,
                                      'recall': recall,
                                      'f1': f1,
                                      'Time to fit': time_fit}, ignore_index=True)

            # group the results by model
            results = results.groupby('model').mean().reset_index().sort_values(by='f1',
                             ascending=False).reset_index(drop=True)

    # return the dataframe
    return results

In [83]:
count_vectorizer = feature_extraction.text.CountVectorizer( min_df=1, stop_words="english")
train_vectors = count_vectorizer.fit_transform(train_df["text"])

## note that we're NOT using .fit_transform() here. Using just .transform() makes sure
# that the tokens in the train vectors are the only ones mapped to the test vectors - 
# i.e. that the train and test vectors use the same set of tokens.
test_vectors = count_vectorizer.transform(test_df["text"])
df = pd.DataFrame(data=train_vectors.todense(),columns = count_vectorizer.get_feature_names_out())
print(train_vectors.shape)
print(df)
print(train_vectors.todense())

(7613, 21363)
      00  000  0000  007npen6lg  00cy9vxeff  00end  00pm  01  02  0215  ...  \
0      0    0     0           0           0      0     0   0   0     0  ...   
1      0    0     0           0           0      0     0   0   0     0  ...   
2      0    0     0           0           0      0     0   0   0     0  ...   
3      0    1     0           0           0      0     0   0   0     0  ...   
4      0    0     0           0           0      0     0   0   0     0  ...   
...   ..  ...   ...         ...         ...    ...   ...  ..  ..   ...  ...   
7608   0    0     0           0           0      0     0   0   0     0  ...   
7609   0    0     0           0           0      0     0   0   0     0  ...   
7610   0    0     0           0           0      0     0   1   0     0  ...   
7611   0    0     0           0           0      0     0   0   0     0  ...   
7612   0    0     0           0           0      0     0   0   0     0  ...   

      ûò  ûò800000  ûòthe  ûòåêcnbc  

In [86]:
# Run the cross validation for the models in the for combinatotions of stratified and shuffle and retarn a 2x2 matrix of results
# stratified = True, shuffle = True
results_stratified_shuffle = cross_validation(models, df, train_df["target"], n_splits=10, random_state=71, stratified=True, shuffle=True)
# stratified = True, shuffle = False
#results_stratified_noshuffle = cross_validation(models, df, train_df["target"], n_splits=10, random_state=71, stratified=True, shuffle=False)
# stratified = False, shuffle = True
#results_nostatified_shuffle = cross_validation(models, df, train_df["target"], n_splits=10, random_state=71, stratified=False, shuffle=True)
# stratified = False, shuffle = False
#results_nostatified_noshuffle = cross_validation(models, df, train_df["target"], n_splits=10, random_state=71, stratified=False, shuffle=False)

In [87]:
results_stratified_shuffle

Unnamed: 0,model,accuracy,precision,recall,f1,Time to fit
0,GaussianNB,0.61713,0.536519,0.801804,0.642702,9.864703


In [92]:
results_nostatified_shuffle

Unnamed: 0,model,accuracy,precision,recall,f1,Time to fit
0,LogisticRegression,0.784437,0.819804,0.629208,0.710996,10.024921
1,XGBClassifier,0.759285,0.793609,0.58118,0.670529,124.09422
2,LGBMClassifier,0.726334,0.782981,0.487209,0.600211,7.488531
3,KNeighborsClassifier,0.697505,0.857808,0.343001,0.488294,0.373013
4,RandomForestClassifier,0.578071,0.257812,0.001601,0.003183,4.343087


In [93]:
results_stratified_noshuffle

Unnamed: 0,model,accuracy,precision,recall,f1,Time to fit
0,LogisticRegression,0.752025,0.747722,0.637343,0.684628,11.99739
1,XGBClassifier,0.710646,0.720827,0.533463,0.611418,158.366493
2,LGBMClassifier,0.708177,0.754305,0.474906,0.580504,13.910943
3,KNeighborsClassifier,0.622279,0.698527,0.200263,0.30526,0.370175
4,RandomForestClassifier,0.570304,0.0,0.0,0.0,5.54898


In [94]:
results_nostatified_noshuffle

Unnamed: 0,model,accuracy,precision,recall,f1,Time to fit
0,LogisticRegression,0.767935,0.81475,0.628959,0.709135,11.220429
1,XGBClassifier,0.724196,0.797849,0.524987,0.631856,153.71115
2,LGBMClassifier,0.692965,0.789387,0.44373,0.565364,9.052463
3,KNeighborsClassifier,0.590987,0.75292,0.145909,0.241767,0.29893
4,RandomForestClassifier,0.545218,0.0,0.0,0.0,4.869191


In [10]:
# k-fold cross validation on K-Nearst Neighbors model
from sklearn.neighbors import KNeighborsClassifier

model_KNN = skl.neighbors.KNeighborsClassifier()
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=71)
i = 1
Accuracy_array = []
for train_index, test_index in kf.split(X_res, y_res):
    X_train, X_test = X_res.iloc[train_index], X_res.iloc[test_index]
    y_train, y_test = y_res.iloc[train_index], y_res.iloc[test_index]
     
    #Train the model
    model_KNN.fit(X_train, np.ravel(y_train,order='C')) #Training the model 
    
    #Evaluate the acuracy of the model
    print(f"Accuracy for the fold no. {i} on the test set: {accuracy_score(y_test, model_LR.predict(X_test))}, doublecheck: {model_LR.score(X_test,y_test)}")
    Accuracy_array.append(accuracy_score(y_test, model_LR.predict(X_test)))
    i += 1

# Compute the mean accuracy
print(f"Mean accuracy: {np.mean(Accuracy_array)}")

Accuracy for the fold no. 1 on the test set: 0.717391304347826, doublecheck: 0.717391304347826
Accuracy for the fold no. 2 on the test set: 0.7119565217391305, doublecheck: 0.7119565217391305
Accuracy for the fold no. 3 on the test set: 0.6413043478260869, doublecheck: 0.6413043478260869
Accuracy for the fold no. 4 on the test set: 0.6630434782608695, doublecheck: 0.6630434782608695
Accuracy for the fold no. 5 on the test set: 0.6630434782608695, doublecheck: 0.6630434782608695
Accuracy for the fold no. 6 on the test set: 0.6739130434782609, doublecheck: 0.6739130434782609
Accuracy for the fold no. 7 on the test set: 0.6304347826086957, doublecheck: 0.6304347826086957
Accuracy for the fold no. 8 on the test set: 0.717391304347826, doublecheck: 0.717391304347826
Accuracy for the fold no. 9 on the test set: 0.6847826086956522, doublecheck: 0.6847826086956522
Accuracy for the fold no. 10 on the test set: 0.7228260869565217, doublecheck: 0.7228260869565217
Mean accuracy: 0.682608695652174


In [16]:
# K-fold cross validation on random forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report

model_RF = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=71)
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=71)
i = 1
Accuracy_array = []
for train_index, test_index in kf.split(X_res, y_res):
    print('Fold: {}'.format(i))
    X_train, X_test = X_res.iloc[train_index], X_res.iloc[test_index]
    y_train, y_test = y_res.iloc[train_index], y_res.iloc[test_index]

    #Train the model
    model_RF.fit(X_train, np.ravel(y_train, order='C'))
    
    #Predict the response for test dataset
    print(f"Accuracy for the fold no. {i} on the test set: {accuracy_score(y_test, model_RF.predict(X_test))}, doublecheck: {model_RF.score(X_test,y_test)}")
    Accuracy_array.append(accuracy_score(y_test, model_RF.predict(X_test)))
    i += 1
    
# Compute the mean accuracy
print(f"Mean accuracy: {np.mean(Accuracy_array)}")

Fold: 1
Accuracy for the fold no. 1 on the test set: 0.7717391304347826, doublecheck: 0.7717391304347826
Fold: 2
Accuracy for the fold no. 2 on the test set: 0.8206521739130435, doublecheck: 0.8206521739130435
Fold: 3
Accuracy for the fold no. 3 on the test set: 0.8097826086956522, doublecheck: 0.8097826086956522
Fold: 4
Accuracy for the fold no. 4 on the test set: 0.8152173913043478, doublecheck: 0.8152173913043478
Fold: 5
Accuracy for the fold no. 5 on the test set: 0.7989130434782609, doublecheck: 0.7989130434782609
Fold: 6
Accuracy for the fold no. 6 on the test set: 0.8043478260869565, doublecheck: 0.8043478260869565
Fold: 7
Accuracy for the fold no. 7 on the test set: 0.7010869565217391, doublecheck: 0.7010869565217391
Fold: 8
Accuracy for the fold no. 8 on the test set: 0.8152173913043478, doublecheck: 0.8152173913043478
Fold: 9
Accuracy for the fold no. 9 on the test set: 0.8097826086956522, doublecheck: 0.8097826086956522
Fold: 10
Accuracy for the fold no. 10 on the test set: 

In [12]:
#from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = skl.model_selection.train_test_split(X_res, y_res) #, stratify= y_res) if you want preserve the same proportion of class in train and test set
#X_train 

In [13]:
# Fit the LR model
LR_model = skl.linear_model.LogisticRegression(C=0.05, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=10000,
                   multi_class='multinomial', n_jobs=None, penalty='l2', random_state=0,
                   tol=0.0001, verbose=0, warm_start=False).fit(X_train, np.ravel(y_train,order='C'))

In [16]:
# Predict the test set
y_pred = LR_model.predict(X_test)
y_pred = pd.DataFrame(y_pred, columns = ["FireMask"])
y_risk = LR_model.predict_proba(X)
 
y_risk = pd.DataFrame(y_risk, columns = ["Risk_0", "Risk_1"])
#df_toplot = pd.DataFrame([X_test, y_test, y_pred, y_risk])



In [None]:
type(y_risk.Risk_1)
y_risk = np.ravel(y_risk.Risk_1,order='C')
y_risk 

In [None]:
df['Ta_mere'] = y_risk
df

In [None]:

#ds.plot(column='Fpar_500m', cmap='coolwarm', legend=True, figsize=(10,10))
ds_toplot = df.to_xarray()
ds_toplot 


In [None]:
# plot the risk map
ds['ET_500m'][1].plot()
plt.show()

In [None]:
# Compute the confusion matrix
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)
n = y_pred.nunique()[0]
fig, ax = plt.subplots(figsize=(10, 10))
ax.imshow(cm)
ax.grid(False)
ax.set_xlabel('Predicted outputs', fontsize=5, color='black')
ax.set_ylabel('Actual outputs', fontsize=5, color='black')
ax.xaxis.set(ticks=range(n))
ax.yaxis.set(ticks=range(n))
ax.set_ylim(n-0.5, -0.5)
for i in range(n):
    for j in range(n):
        ax.text(j, i, cm[i, j], ha='center', va='center', color='white')
plt.show()

In [45]:
# classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.77      0.81      0.79       228
           1       0.80      0.77      0.78       232

    accuracy                           0.79       460
   macro avg       0.79      0.79      0.79       460
weighted avg       0.79      0.79      0.79       460



# KNN

In [47]:
# Fit  the KNN model
from sklearn.neighbors import KNeighborsClassifier
model_KNN = KNeighborsClassifier().fit(X_train, np.ravel(y_train,order='C'))
y_pred = model_KNN.predict(X_test)
y_pred = pd.DataFrame(y_pred, columns = ["FireMask"])


In [None]:
# Compute the confusion matrix

cm = confusion_matrix(y_test, y_pred)

n = y_pred.nunique()[0]

 
fig, ax = plt.subplots(figsize=(2*n, 2*n))
ax.imshow(cm)
ax.grid(False)
ax.set_xlabel('Predicted outputs', fontsize=5, color='black')
ax.set_ylabel('Actual outputs', fontsize=5, color='black')
ax.xaxis.set(ticks=range(n))
ax.yaxis.set(ticks=range(n))
ax.set_ylim(n-0.5, -0.5)
for i in range(n):
    for j in range(n):
        ax.text(j, i, cm[i, j], ha='center', va='center', color='white')
plt.show()


In [None]:
# classification report
print(classification_report(y_test, y_pred))

RAndom Forest

In [None]:
# Fit the Random Forest model
from sklearn.ensemble import RandomForestClassifier
model_RF = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0).fit(X_train, np.ravel(y_train,order='C'))
y_pred = model_RF.predict(X_test)
y_pred = pd.DataFrame(y_pred, columns = ["FireMask"])
y_pred

In [None]:
# Compute the confusion matrix

cm = confusion_matrix(y_test, y_pred)

n = y_pred.nunique()[0]


fig, ax = plt.subplots(figsize=(2*n, 2*n))
ax.imshow(cm)
ax.grid(False)
ax.set_xlabel('Predicted outputs', fontsize=5, color='black')
ax.set_ylabel('Actual outputs', fontsize=5, color='black')
ax.xaxis.set(ticks=range(n))
ax.yaxis.set(ticks=range(n))
ax.set_ylim(n-0.5, -0.5)
for i in range(n):
    for j in range(n):
        ax.text(j, i, cm[i, j], ha='center', va='center', color='white')
plt.show()



In [None]:
# classification report
print(classification_report(y_test, y_pred))
