In [None]:
# Import libraries 
# Standard libraries for data processing 
import pandas as pd 
import numpy as np 
import random 
import re

# Data visualization
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator 
import matplotlib.pyplot as plt 
import seaborn as sns 
from PIL import Image 

# NLTK processing 
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import PorterStemmer

# Modeling 
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV 
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
import tensorflow as tf 
import torch as pt

In [None]:
# Read the training and test CSV files from the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [None]:
# Report the number of rows in the training frame.
print("Length of the training data set: {}\n".format(train.shape[0]))

# Show six randomly drawn rows (last expression, so the notebook renders it).
train.sample(n=6)

In [None]:
# Display the dtype of every column in the training frame.
train.dtypes

In [None]:
# Cast the text-like columns to Python strings in both frames.
# NOTE(review): astype(str) presumably turns missing values into the literal
# string 'nan' - confirm that is acceptable for 'keyword' and 'location'.
obj = ['keyword', 'location', 'text']

train[obj], test[obj] = train[obj].astype(str), test[obj].astype(str)

In [None]:
# Class balance: number of ids recorded for each target value.
train.groupby('target')['id'].count()

In [None]:
# Porter stemmer from NLTK; the same instance is reused by later cells.
ps = PorterStemmer()

# Keywords contain the literal sequence '%20' between words: replace it with a space.
# Done column-wise instead of a per-row .loc loop, which was slow and only
# worked when the index labels were exactly 0..n-1.
train['keyword'] = train['keyword'].str.replace('%20', ' ', regex=False)

# Stem every keyword; each keyword string is handed to the stemmer as a whole.
train['keyword_stem'] = train['keyword'].map(ps.stem)

In [None]:
def kwDict(df, target):
    """Count how often each stemmed keyword occurs among rows with the given target.

    Parameters
    ----------
    df : DataFrame with a 'target' column and a 'keyword_stem' column.
    target : value compared against df['target'] to select the rows to count.

    Returns
    -------
    dict mapping keyword stem -> number of occurrences, with keys in order of
    first appearance in `df`.
    """
    # Local import: Counter is not among the notebook's top-level imports.
    from collections import Counter

    stems = df.loc[df['target'] == target, 'keyword_stem'].tolist()
    # Single pass over the stems; calling list.count() once per element was quadratic.
    return dict(Counter(stems))

In [None]:
# Keyword frequency tables: kw1_dict for target 1 (disaster), kw0_dict for target 0 (non-disaster).
kw1_dict, kw0_dict = kwDict(train, 1), kwDict(train, 0)

In [None]:
# Preview the first ten keyword/count pairs of each table.
print(dict(list(kw1_dict.items())[:10]))
print(dict(list(kw0_dict.items())[:10]))

In [None]:
# Render one word cloud per keyword table, with word size driven by the counts.
kw1_wc, kw0_wc = (
    WordCloud(background_color="black", width=2500, height=2500, relative_scaling=1.0)
    .generate_from_frequencies(freqs)
    for freqs in (kw1_dict, kw0_dict)
)

In [None]:
# Draw both keyword word clouds side by side in a single figure.
plt.figure(figsize=(20, 20))

# (subplot position, word cloud image, panel title) for target 1 and target 0.
for position, cloud, panel_title in (
    (121, kw1_wc, 'Target 1 Wordcloud'),
    (122, kw0_wc, 'Target 0 Wordcloud'),
):
    plt.subplot(position)
    plt.imshow(cloud)
    plt.title(panel_title)

plt.show()

In [None]:
def stemSentence(sentence):
    """Tokenize `sentence`, stem each token with the notebook-level `ps`, and re-join.

    Every stemmed token is followed by one space, so a non-empty result ends
    with a trailing space; a sentence with no tokens yields ''.
    """
    return "".join(ps.stem(token) + " " for token in word_tokenize(sentence))

In [None]:
def cleanText(df, Series):
    """Normalise a text column and derive token, stop-word-free and stemmed columns.

    Parameters
    ----------
    df : DataFrame holding the text column. It is modified in place.
    Series : name of the text column to clean; its values must be strings.

    Returns
    -------
    The same `df` object, with `df[Series]` overwritten by its cleaned form and
    three columns added: 'word_tokens' (list of tokens), 'clean_text' (tokens
    without stop words, space-joined) and 'stemmed' (output of stemSentence).
    """
    # Replace every non-word character (anything not matched by \w) with a space, then lower-case.
    df[Series] = df[Series].map(lambda text: re.sub(r'\W', ' ', text).lower())

    # NLTK English stop words plus a few extra noise tokens.
    stop_words = set(stopwords.words('english'))
    stop_words.update(['co', 'http', 'https', 'û_', 'via'])

    # Tokenize directly from the column; a row-wise DataFrame.apply(axis=1) is not needed.
    df['word_tokens'] = df[Series].map(word_tokenize)

    # Drop stop words and join the remaining tokens back into one string.
    df['clean_text'] = df['word_tokens'].map(
        lambda tokens: ' '.join(token for token in tokens if token not in stop_words)
    )

    # Stem the cleaned text using stemSentence.
    df['stemmed'] = df['clean_text'].map(stemSentence)

    return df

In [None]:
def vectorize(vectorizer, df=None):
    """Fit `vectorizer` on a frame's 'stemmed' column and return the result of the fit.

    Parameters
    ----------
    vectorizer : object exposing `fit(texts)`, e.g. CountVectorizer or TfidfVectorizer.
    df : optional DataFrame with a 'stemmed' column. When omitted, the
        notebook-level `train` frame is used (the behaviour of a one-argument call).

    Returns
    -------
    Whatever `vectorizer.fit` returns.
    """
    # Passing only the training split as `df` keeps held-out rows out of the fitted vocabulary.
    corpus = train if df is None else df
    return vectorizer.fit(corpus['stemmed'])

def featureVec(vectorizer, df):
    """Turn df['stemmed'] into a dense feature frame using a fitted vectorizer.

    Parameters
    ----------
    vectorizer : fitted object exposing `transform(texts)` and either
        `get_feature_names_out()` or the older `get_feature_names()`.
    df : DataFrame with a 'stemmed' text column; any other columns are
        carried over as additional features.

    Returns
    -------
    DataFrame with one column per vectorizer feature, followed by the
    non-'stemmed' columns of `df`. The result has a fresh 0..n-1 index.
    """
    matrix = vectorizer.transform(df['stemmed'])

    # get_feature_names() is gone from recent scikit-learn releases; prefer the
    # replacement and fall back only when it does not exist.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # toarray() yields a plain ndarray (todense() would give an np.matrix).
    vec_df = pd.DataFrame(matrix.toarray(), columns=feature_names)

    # Extra features are selected by name, so the position of 'stemmed' in df does not matter.
    # The index is reset so rows line up positionally with vec_df.
    extras = df.drop(columns='stemmed').reset_index(drop=True)
    if extras.shape[1] == 0:
        return vec_df

    return pd.concat([vec_df, extras], axis=1)

In [None]:
def modelRun(model, param_grid):
    """Grid-search `model` on the notebook's training split and print hold-out metrics.

    Reads the notebook-level X_train, y_train, X_test and y_test.

    Parameters
    ----------
    model : scikit-learn estimator to tune.
    param_grid : parameter grid passed to GridSearchCV.

    Returns
    -------
    The fitted GridSearchCV object (10-fold cross-validation, all CPU cores).
    """
    # Seeds Python's `random` module only.
    random.seed(42)

    # The labels may arrive as a single-column DataFrame; flatten them to 1-D
    # so scikit-learn does not warn about a column-vector y.
    y_fit = np.ravel(y_train)
    y_true = np.ravel(y_test)

    mod_cv = GridSearchCV(model, param_grid=param_grid, cv=10, n_jobs=-1, verbose=0)
    mod_cv.fit(X_train, y_fit)

    # Print the tuned parameters and score, labelled with the actual estimator class.
    print("Tuned {} Parameters: {}".format(type(model).__name__, mod_cv.best_params_))
    print("Best score is {}".format(round(100 * mod_cv.best_score_, 2)))

    # Predict the hold-out split with the best parameters; print F1 and the confusion matrix.
    y_pred = mod_cv.predict(X_test)
    print('F1-score: {}\n'.format(round(100 * f1_score(y_true, y_pred), 3)))
    print('Confusion matrix:\n {}'.format(confusion_matrix(y_true, y_pred)))

    return mod_cv

In [None]:
# Clean the 'text' column of the training frame; cleanText also adds the
# 'word_tokens', 'clean_text' and 'stemmed' columns and returns the same frame.
train = cleanText(train, 'text')

In [None]:
# Integer-encode the stemmed keywords; `kw` names the fitted encoder so later cells can reuse it.
le = LabelEncoder()
train['kws_le'] = le.fit_transform(train['keyword_stem'])
kw = le

In [None]:
# Input column(s) and label column, given as lists so both selections are DataFrames.
features = ['stemmed']
target = ['target']

# Hold out 33% of the labelled rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    train[features],
    train[target],
    random_state=42,
    test_size=0.33,
)

In [None]:
# Count Vectorizer for data modeling: binary indicators for unigrams and bigrams
# that appear in at least two documents.
cv = CountVectorizer(min_df=2, binary=True, encoding='utf-8', ngram_range=(1,2), stop_words='english')

# TF-IDF Vectorizer for data modeling.
# Bound to `tfidf` instead of `tf`: `tf` is the alias of the tensorflow import
# at the top of the notebook and must not be overwritten.
tfidf = TfidfVectorizer(min_df=2, norm='l2', encoding='utf-8', ngram_range=(1,2), stop_words='english')

In [None]:
# Fit the count vectorizer. vectorize() fits on train['stemmed'], i.e. the whole
# training frame, including the rows that were split off into X_test.
vec = vectorize(cv)

In [None]:
# Replace both splits by their vectorized feature frames.
# Not re-runnable: afterwards the splits no longer contain the 'stemmed' column.
X_train, X_test = featureVec(vec, X_train), featureVec(vec, X_test)

In [None]:
# Candidate estimators.
svc = LinearSVC(random_state=42)
mnb = MultinomialNB()

# Value ranges explored during hyperparameter tuning.
iterations = range(20000, 23001, 1000)
c_space = np.arange(0.1, 0.5, 0.1)
alpha_fit = np.arange(0.1, 1.1, 0.1)

# One parameter grid per estimator.
svc_param_grid = dict(max_iter=iterations, C=c_space)
mnb_param_grid = dict(alpha=alpha_fit)

In [None]:
# Tune MultinomialNB over the alpha grid; `model` is the fitted GridSearchCV returned by modelRun.
model = modelRun(mnb, mnb_param_grid)

In [None]:
# Next steps:
# 1. SIA for text
# 2. Length of text, word count, avg word length
# 3. Test pytorch and tensorflow NN. 

# Bonus: 
# 1. Work with pipelines 
# 2. Clean locations
# 3. Tie locations to keywords

In [None]:
# Apply the same keyword preparation to the test frame, column-wise rather than
# through a per-row .loc loop.
test['keyword'] = test['keyword'].str.replace('%20', ' ', regex=False)  # '%20' -> blank space
test['keyword_stem'] = test['keyword'].map(ps.stem)                     # Stem words

# Encode with the encoder fitted on the training keywords.
# NOTE(review): LabelEncoder.transform raises on labels it was not fitted on -
# confirm every test keyword stem also occurs in the training data.
test['kws_le'] = kw.transform(test['keyword_stem'])

In [None]:
# Clean the 'text' column of the test frame the same way as the training frame.
test = cleanText(test, 'text')

In [None]:
# Vectorize the test text with the vectorizer fitted earlier; only the `features` columns are passed in.
df_test = featureVec(vec, test[features])
# Disabled experiment: append the label-encoded keyword as an extra feature.
#df_test = pd.concat([df_test, test['kws_le'].reset_index()], axis=1)

In [None]:
# Predict the test rows with the tuned model (the fitted GridSearchCV from modelRun).
test_pred = model.predict(df_test)

In [None]:
# Output file name carries a manually maintained version number.
version = 7
name = 'results_v'+str(version)+'.csv'

# Pair every test id with its prediction directly. Building the frame in one
# step avoids index alignment between two separately indexed objects and the
# in-place column rename.
results = pd.DataFrame({'id': test['id'], 'target': test_pred})
results.to_csv(name, index=False)