In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('Data/train.csv')
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
df.loc[0]['text']

'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'

In [4]:
from io import StringIO
col = ['target', 'text']
df = df[col]
df.tail()

Unnamed: 0,target,text
7608,1,Two giant cranes holding a bridge collapse int...
7609,1,@aria_ahrary @TheTawniest The out of control w...
7610,1,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...
7611,1,Police investigating after an e-bike collided ...
7612,1,The Latest: More Homes Razed by Northern Calif...


In [5]:
df.columns = ['target', 'text']
target_df = df[['text', 'target']].drop_duplicates().sort_values('target')
target_dict = dict(target_df.values)
df.head()

Unnamed: 0,target,text
0,1,Our Deeds are the Reason of this #earthquake M...
1,1,Forest fire near La Ronge Sask. Canada
2,1,All residents asked to 'shelter in place' are ...
3,1,"13,000 people receive #wildfires evacuation or..."
4,1,Just got sent this photo from Ruby #Alaska as ...


In [6]:
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(8,6))
df.groupby('target').text.count().plot.bar(ylim=0)
plt.show()

<Figure size 800x600 with 1 Axes>

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')
features = tfidf.fit_transform(df.text).toarray()
labels = df.target
features.shape

(7613, 3623)

In [8]:
df.head()
sorted(target_dict)[0]

'! Residents Return To Destroyed Homes As Washington Wildfire Burns on http://t.co/UcI8stQUg1'

In [10]:
from sklearn.feature_selection import chi2
N = 2
for target in [0,1]:
  features_chi2 = chi2(features, labels == target)
  indices = np.argsort(features_chi2[0])
  feature_names = np.array(tfidf.get_feature_names())[indices]
  unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
  bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
  print("# '{}':".format(target))
  print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
  print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))

# '0':
  . Most correlated unigrams:
. hiroshima
. california
  . Most correlated bigrams:
. suicide bombing
. suicide bomber
# '1':
  . Most correlated unigrams:
. hiroshima
. california
  . Most correlated bigrams:
. suicide bombing
. suicide bomber


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['target'], random_state = 0)
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
clf = MultinomialNB().fit(X_train_tfidf, y_train)

In [12]:
test = pd.read_csv('Data/test.csv')

In [15]:
test.iloc[0].text

'Just happened a terrible car crash'

In [16]:
print(clf.predict(count_vect.transform([test.iloc[0].text])))

[1]


In [27]:
test.head(1).text[0]

'Just happened a terrible car crash'

In [29]:
test["prediction"] = test.apply(lambda row : clf.predict(count_vect.transform([row['text']])), axis = 1)

In [37]:
answerDf = pd.DataFrame()
answerDf = test[['id', 'prediction']]
answerDf.columns = ['id', 'target']
answerDf['target'] = answerDf.apply(lambda row : row['target'][0], axis=1)
answerDf.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,1
4,11,1


In [38]:
answerDf.to_csv('submission1.csv', index= False)