**Import datasets and libraries**

In [65]:
import numpy as np
import pandas as pd
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from textblob import TextBlob
from textblob import Word

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [66]:
train=pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [67]:
train=train.drop(['id','keyword','location'],axis=1)
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    7613 non-null   object
 1   target  7613 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 119.1+ KB


In [69]:
test=pd.read_csv('/kaggle/input/nlp-getting-started/test.csv').drop(['id','keyword','location'],axis=1)
test.head()

Unnamed: 0,text
0,Just happened a terrible car crash
1,"Heard about #earthquake is different cities, s..."
2,"there is a forest fire at spot pond, geese are..."
3,Apocalypse lighting. #Spokane #wildfires
4,Typhoon Soudelor kills 28 in China and Taiwan


In [70]:
train.info(),test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    7613 non-null   object
 1   target  7613 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 119.1+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    3263 non-null   object
dtypes: object(1)
memory usage: 25.6+ KB


(None, None)

**Create two functions (one nested) that does the cleaning and processing the data**

In [71]:
def processRow(row):
    
    tweet = row
    #Lower case
    tweet.lower()
    #Removes unicode strings like "\u002c" and "x96"
    tweet = re.sub(r'(\\u[0-9A-Fa-f]+)'," ", tweet)
    tweet = re.sub(r'[^\x00-\x7f]'," ",tweet)
    #convert any url to URL
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL',tweet)
    #Convert any @Username to "AT_USER"
    tweet = re.sub('@[^\s]+','AT_USER',tweet)
    #Remove additional white spaces
    tweet = re.sub('[\s]+', ' ', tweet)
    tweet = re.sub('[\n]+', ' ', tweet)
    #Remove not alphanumeric symbols white spaces
    tweet = re.sub(r'[^\w]', ' ', tweet)
    #Removes hastag in front of a word """
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    #Replace #word with word
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    #Remove :( or :)
    tweet = tweet.replace(':)'," ")
    tweet = tweet.replace(':('," ")
    #remove numbers
    tweet = "".join([i for i in tweet if not i.isdigit()])
    #remove multiple exclamation
    tweet = re.sub(r"(\!)\1+", ' ', tweet)
    #remove multiple question marks
    tweet = re.sub(r"(\?)\1+", ' ', tweet)
    #remove multistop
    tweet = re.sub(r"(\.)\1+", ' ', tweet)
    #lemma
    from textblob import Word
    tweet =" ".join([Word(word).lemmatize() for word in tweet.split()])
    tweet = tweet.strip('\'"')
    #lowercase
    tweet = tweet.lower()
    tweet = tweet.split()
    ps = PorterStemmer()
    all_stopwords = stopwords.words('english')
    tweet = [ps.stem(word) for word in tweet if not word in set(all_stopwords)]
    tweet = ' '.join(tweet)
    
    
    row = tweet
    return row

def processData(dataset,predictor):
    
    corpus=[]
    for i in range(0,len(dataset)):
        tweet = processRow(predictor[i])
        corpus.append(tweet)
    return corpus

corpus_train=processData(train,train['text'])
corpus_test=processData(test,test['text'])

**Use CountVectorizer to create a matrix of token counts. One for training and one for testing**

In [72]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 3000)

X = cv.fit_transform(corpus_train).toarray()
y=train.iloc[:,-1].values


Xt=cv.transform(corpus_test).toarray()

X.shape,y.shape,Xt.shape

((7613, 3000), (7613,), (3263, 3000))

In [73]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

X_train.shape,X_test.shape,y_train.shape,y_test.shape

((6090, 3000), (1523, 3000), (6090,), (1523,))

In [None]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
xgb1 = xgb.XGBRegressor()
parameters = {'objective':['reg:squarederror'],
              'booster':['gbtree'],
              'learning_rate': [0.05,0.2,0.25,0.3], #so called `eta` value
              'max_depth': [5, 8, 10,12],
              'n_estimators': [100]}

#Use gridsearch with 2 folds for finding optimal parameters and vlaues
xgb_grid = GridSearchCV(xgb1,
                        parameters,
                        cv = 2,
                        n_jobs = 2,
                        verbose=True)

xgb_grid.fit(X_train,y_train)
print(xgb_grid.best_params_)

#output: booster= 'gbtree', learning_rate= 0.3, max_depth= 5, n_estimators= 100, objective= 'reg:squarederror' 

In [75]:
# Instantiate an XGBoost object with hyperparameters
xgb_reg = xgb.XGBRegressor(booster= 'gbtree', learning_rate= 0.3, max_depth= 5, n_estimators= 100, objective= 'reg:squarederror')
# Train the model with train data sets
xgb_reg.fit(X_train, y_train)

y_pred = xgb_reg.predict(X_test) # Predictions

y_pred[y_pred<0.5]=0
y_pred[y_pred>0.5]=1
from sklearn.metrics import confusion_matrix, accuracy_score

cm = confusion_matrix(y_test, y_pred)
print(cm)
print(accuracy_score(y_test, y_pred))

[[807  79]
 [218 419]]
0.8049901510177282


In [76]:
y_pred_test=xgb_reg.predict(Xt)

y_pred_test[y_pred_test<0.5]=0
y_pred_test[y_pred_test>0.5]=1

y_pred_test=y_pred_test.astype(int)


test_output=pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')
output = pd.DataFrame({'id': test_output.id, 'target': y_pred_test.astype('str')})
output.to_csv("nlp_submission.csv", index=False)
print('Submission done!')

Submission done!
