# Life cycle of a Data Science Project
- ~~Gather Data~~
- ~~Exploratory data analysis~~
- Feature Engineering 
- Feature Scaling 
- Model Creation
- Hyperpatameter tuning
- Model Deployment
- Model Training

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 

In [2]:
trainDf = pd.read_csv('train.csv')

In [3]:
trainDf.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
trainDf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [5]:
testDf = pd.read_csv('test.csv')

In [6]:
testDf.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [7]:
trainDf.shape

(7613, 5)

In [8]:
testDf.shape

(3263, 4)

In [9]:
totalDf = pd.concat([trainDf, testDf], axis=0)

In [10]:
totalDf.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1.0
1,4,,,Forest fire near La Ronge Sask. Canada,1.0
2,5,,,All residents asked to 'shelter in place' are ...,1.0
3,6,,,"13,000 people receive #wildfires evacuation or...",1.0
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1.0


In [11]:
totalDf.shape

(10876, 5)

In [12]:
totalDf.isnull().sum()

id             0
keyword       87
location    3638
text           0
target      3263
dtype: int64

In [13]:
trainDf.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [14]:
testDf.isnull().sum()

id             0
keyword       26
location    1105
text           0
dtype: int64

In [15]:
len(totalDf['location'].unique())

4522

In [16]:
len(totalDf['keyword'].unique())

222

In [17]:
totalDf['location'].replace('World Wide!!', 'Worldwide', inplace=True)
totalDf['location'].replace('London, UK', 'UK', inplace=True)

topKeyword = totalDf.keyword.value_counts().sort_values(ascending=False).sample(n=10, random_state=33).to_dict()

type(topKeyword)

topKeyword

In [18]:
topLocation = totalDf.location.value_counts().sort_values(ascending=False).head(10).to_dict()
topLocation

{'USA': 141,
 'New York': 109,
 'United States': 65,
 'London': 58,
 'UK': 46,
 'Canada': 42,
 'Nigeria': 40,
 'Worldwide': 36,
 'India': 35,
 'Los Angeles, CA': 34}

In [19]:
type(topLocation)

dict

In [20]:
def handleCat(topCol):
    for tl in topCol.keys():
        totalDf[tl] = np.where(totalDf['location'] == tl, 1, 0)

In [21]:
handleCat(topLocation)
# handleCat(topKeyword)

In [22]:
totalDf[24:38]

Unnamed: 0,id,keyword,location,text,target,USA,New York,United States,London,UK,Canada,Nigeria,Worldwide,India,"Los Angeles, CA"
24,36,,,LOOOOOOL,0.0,0,0,0,0,0,0,0,0,0,0
25,37,,,No way...I can't eat that shit,0.0,0,0,0,0,0,0,0,0,0,0
26,38,,,Was in NYC last week!,0.0,0,0,0,0,0,0,0,0,0,0
27,39,,,Love my girlfriend,0.0,0,0,0,0,0,0,0,0,0,0
28,40,,,Cooool :),0.0,0,0,0,0,0,0,0,0,0,0
29,41,,,Do you like pasta?,0.0,0,0,0,0,0,0,0,0,0,0
30,44,,,The end!,0.0,0,0,0,0,0,0,0,0,0,0
31,48,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1.0,0,0,0,0,0,0,0,0,0,0
32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0.0,0,0,0,0,0,0,0,0,0,0
33,50,ablaze,AFRICA,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1.0,0,0,0,0,0,0,0,0,0,0


In [23]:
totalDf.drop(['id', 'location'], axis=1, inplace=True)

In [24]:
totalDf.head()

Unnamed: 0,keyword,text,target,USA,New York,United States,London,UK,Canada,Nigeria,Worldwide,India,"Los Angeles, CA"
0,,Our Deeds are the Reason of this #earthquake M...,1.0,0,0,0,0,0,0,0,0,0,0
1,,Forest fire near La Ronge Sask. Canada,1.0,0,0,0,0,0,0,0,0,0,0
2,,All residents asked to 'shelter in place' are ...,1.0,0,0,0,0,0,0,0,0,0,0
3,,"13,000 people receive #wildfires evacuation or...",1.0,0,0,0,0,0,0,0,0,0,0
4,,Just got sent this photo from Ruby #Alaska as ...,1.0,0,0,0,0,0,0,0,0,0,0


In [25]:
totalDf.isnull().sum()

keyword              87
text                  0
target             3263
USA                   0
New York              0
United States         0
London                0
UK                    0
Canada                0
Nigeria               0
Worldwide             0
India                 0
Los Angeles, CA       0
dtype: int64

In [26]:
totalDf.shape

(10876, 13)

In [27]:
finalTrain = totalDf.iloc[:-3263,:]

In [28]:
finalTrain.shape

(7613, 13)

In [29]:
finalTrain.tail()

Unnamed: 0,keyword,text,target,USA,New York,United States,London,UK,Canada,Nigeria,Worldwide,India,"Los Angeles, CA"
7608,,Two giant cranes holding a bridge collapse int...,1.0,0,0,0,0,0,0,0,0,0,0
7609,,@aria_ahrary @TheTawniest The out of control w...,1.0,0,0,0,0,0,0,0,0,0,0
7610,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1.0,0,0,0,0,0,0,0,0,0,0
7611,,Police investigating after an e-bike collided ...,1.0,0,0,0,0,0,0,0,0,0,0
7612,,The Latest: More Homes Razed by Northern Calif...,1.0,0,0,0,0,0,0,0,0,0,0


In [30]:
finalTest = totalDf.iloc[-3263:,:]

In [31]:
finalTest.head()

Unnamed: 0,keyword,text,target,USA,New York,United States,London,UK,Canada,Nigeria,Worldwide,India,"Los Angeles, CA"
0,,Just happened a terrible car crash,,0,0,0,0,0,0,0,0,0,0
1,,"Heard about #earthquake is different cities, s...",,0,0,0,0,0,0,0,0,0,0
2,,"there is a forest fire at spot pond, geese are...",,0,0,0,0,0,0,0,0,0,0
3,,Apocalypse lighting. #Spokane #wildfires,,0,0,0,0,0,0,0,0,0,0
4,,Typhoon Soudelor kills 28 in China and Taiwan,,0,0,0,0,0,0,0,0,0,0


In [32]:
finalTrain.to_csv('finalTrain.csv', index=False)
finalTest.to_csv('finalTest.csv', index=False)