## FEATURE ENGINEERING 1
- Create dataframe series which specifies if location and keywords are missing
- Create a model with this columns (depending on the metrics generated)
- Create a pandas series which specifies if the text has an hashtag
- Combine with text model

In [1]:
import pandas as pd

In [2]:
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
id          7613 non-null int64
keyword     7552 non-null object
location    5080 non-null object
text        7613 non-null object
target      7613 non-null int64
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


### CREATE DATAFRAME SERIES (empty_location and empty_keyword)

In [16]:
train["keyword"] = train["keyword"].fillna("None")
train["location"] = train["location"].fillna("None")

In [17]:
def empty_keyword(df):
    if df == "None":
        return 0
    else:
        return 1

In [18]:
train["empty_keyword"] = train["keyword"].apply(empty_keyword)
train["empty_location"] = train["location"].apply(empty_keyword)

In [19]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 7 columns):
id                7613 non-null int64
keyword           7613 non-null object
location          7613 non-null object
text              7613 non-null object
target            7613 non-null int64
empty_keyword     7613 non-null int64
empty_location    7613 non-null int64
dtypes: int64(4), object(3)
memory usage: 416.5+ KB


In [59]:
train["keyword"].value_counts()

None                     61
fatalities               45
armageddon               42
deluge                   42
damage                   41
                         ..
forest%20fire            19
epicentre                12
threat                   11
inundation               10
radiation%20emergency     9
Name: keyword, Length: 222, dtype: int64

In [20]:
train["empty_keyword"].value_counts()

1    7552
0      61
Name: empty_keyword, dtype: int64

In [21]:
train["empty_location"].value_counts()

1    5080
0    2533
Name: empty_location, dtype: int64

In [22]:
train["target"].value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [23]:
feature_cols = ["empty_keyword", "empty_location"]

In [24]:
X = train[feature_cols]
X.head()

Unnamed: 0,empty_keyword,empty_location
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0


In [54]:
X_eins = train["empty_location"]
X_eins = X_eins.values.reshape(-1, 1)

In [25]:
y = train["target"]

### CREATE A MODEL WITH empty_location and empty_keyword

In [52]:
from sklearn.model_selection import cross_val_score

def run_model(model):
    return cross_val_score(model, X, y, cv = 5, scoring = "accuracy").mean()

def run_model_eins(model):
    return cross_val_score(model, X_eins, y, cv = 5, scoring = "accuracy").mean()

In [39]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings("ignore")

lr = LogisticRegression()
nb = MultinomialNB()
rf = RandomForestClassifier()
xgb = XGBClassifier()

In [55]:
print(run_model(lr))
print(run_model_eins(lr))

0.5702089760861372
0.5703402096819377


In [56]:
print(run_model(nb))
print(run_model_eins(nb))

0.5703402096819377
0.5703402096819377


In [57]:
print(run_model(rf))
print(run_model_eins(rf))

0.5702089760861372
0.5703402096819377


In [58]:
print(run_model(xgb))
print(run_model_eins(xgb))

0.5702089760861372
0.5703402096819377


#### Conclusion: the scores are not much different from the NULL-MODEL 

### PANDAS SERIES INDICATING PRESENCE OF HASHTAGS

In [3]:
train["text"].head()

0    Our Deeds are the Reason of this #earthquake M...
1               Forest fire near La Ronge Sask. Canada
2    All residents asked to 'shelter in place' are ...
3    13,000 people receive #wildfires evacuation or...
4    Just got sent this photo from Ruby #Alaska as ...
Name: text, dtype: object

In [26]:
import re

def hashtag(df):
    if re.search(r'#\w+', df):
        return 1
    else:
        return 0
    
    
def empty_keyword(df):
    if df == "None":
        return 0
    else:
        return 1

def run_model(model):
    return cross_val_score(model, X, y, cv = 5, scoring = "accuracy").mean()

In [22]:
train["hashtag"] = train["text"].apply(hashtag)
train["empty_location"] = train["location"].apply(empty_keyword)

In [23]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 7 columns):
id                7613 non-null int64
keyword           7552 non-null object
location          5080 non-null object
text              7613 non-null object
target            7613 non-null int64
hashtag           7613 non-null int64
empty_location    7613 non-null int64
dtypes: int64(4), object(3)
memory usage: 416.5+ KB


In [19]:
train["hashtag"].value_counts()

0    5870
1    1743
Name: hashtag, dtype: int64

In [None]:
train["hashtag"].head(-5)

In [24]:
feature_col = ["hashtag", "empty_location"]

In [25]:
X = train[feature_col]
y = train["target"]

In [39]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings("ignore")

lr = LogisticRegression()
nb = MultinomialNB()
rf = RandomForestClassifier()
xgb = XGBClassifier()

In [32]:
run_model(nb)

0.5703402096819377

In [33]:
run_model(lr)

0.5666608404309521

In [35]:
run_model(rf)

0.5703402096819377

In [38]:
run_model(xgb)

0.5666608404309521

#### Conclusion: not much better than null model, might help when combined with the text

### CREATE A PANDAS SERIES THAT GIVES US THE LEN OF THE TEXT, COMBINE WITH HASHTAG AND MODEL

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
id          7613 non-null int64
keyword     7552 non-null object
location    5080 non-null object
text        7613 non-null object
target      7613 non-null int64
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [20]:
import re

def text_len(df):
    return len(df.split(" "))

def hashtag(df):
    if re.search(r'#\w+', df):
        return 1
    else:
        return 0
    
def run_model(model):
    return cross_val_score(model, X, y, cv = 5, scoring = "accuracy").mean()

In [8]:
##s = "i am a boy"

In [16]:
###len(s.split(" "))

In [18]:
train["length"] = train["text"].apply(text_len)
train["hashtag"] = train["text"].apply(hashtag)

In [19]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 7 columns):
id          7613 non-null int64
keyword     7552 non-null object
location    5080 non-null object
text        7613 non-null object
target      7613 non-null int64
length      7613 non-null int64
hashtag     7613 non-null int64
dtypes: int64(4), object(3)
memory usage: 416.5+ KB


In [21]:
feature_cols = ["hashtag", "length"]

X = train[feature_cols]
y = train["target"]

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings("ignore")

lr = LogisticRegression()
nb = MultinomialNB()
rf = RandomForestClassifier()
xgb = XGBClassifier()

In [23]:
run_model(lr)

0.5713900777689634

In [24]:
run_model(nb)

0.5703402096819377

In [25]:
run_model(rf)

0.5623266839242815

In [26]:
run_model(xgb)

0.5667921607607251

### COMBINE HASHTAG and  PANDAS SERIES WITH TEXT PANDAS SERIES USING FEATURE UNION