In [1]:
import re
import math
import warnings
warnings.filterwarnings('ignore')
from string import punctuation
import pandas as pd
import numpy as np


from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split,KFold

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

In [2]:
# Training and testing data are merged into single CSV and named "Whole.csv" for data cleaning purpose

In [3]:
datafile1=r'C:\Users\dattmish\Documents\IITK\Project 1\Trial_2/Whole.csv'
Complete=pd.read_csv(datafile1).drop(['Issue','Consumer complaint narrative'],1)
Issue=(pd.read_csv(datafile1)['Issue'])
narrative=(pd.read_csv(datafile1)['Consumer complaint narrative'])

# Issue and Complaint Narrative Columns were dropped to perform other operatoions and rest of the operations are kept same as in benchmark script

In [4]:
for col in ['Date received','Date sent to company']:
    Complete[col]=pd.to_datetime(Complete[col],infer_datetime_format=True)
Complete['day_diff']=pd.to_numeric(Complete['Date sent to company']-Complete['Date received'])

for col in ['Date received','Date sent to company']:
    Complete.drop([col],1,inplace=True)
    
for col in ['Sub-product','Sub-issue','Company public response','Tags','Consumer consent provided?']:
    varname=col.replace('-','_').replace('?','').replace(" ",'_')+'_isNan'
    Complete[varname]=np.where(pd.isnull(Complete[col]),1,0)
    Complete.drop([col],1,inplace=True)

for col in ['ZIP code','Company']:
    Complete.drop([col],1,inplace=True)

k=Complete['State'].value_counts()
for val in k.axes[0][0:15]:
    varname='State_'+val.replace(',','_').replace(' ','_')
    Complete[varname]=np.where(Complete['State']==val,1,0)
del Complete['State']

for col in ['Product','Submitted via','Company response to consumer','Timely response?']:    
    temp=pd.get_dummies(Complete[col],prefix=col,drop_first=True)
    Complete=pd.concat([temp,Complete],1)
    Complete.drop([col],1,inplace=True)

Complete['Consumer disputed?']=np.where(Complete['Consumer disputed?']=="Yes",1,0)
# Complete.dtypes
# Complete.shape

# TFIDF for Issue & Narrative Columns

In [5]:
eng_stopwords = set(stopwords.words('english')+list(punctuation))

# lem = WordNetLemmatizer()
# stem = PorterStemmer()
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]

In [6]:
Issue=Issue.str.lower()
Issue = Issue.apply(lambda x: ' '.join([word for word in x.split() if word not in (eng_stopwords)]))
Issue = Issue.apply(lemmatize_text)

In [7]:
narrative=narrative.str.lower()
narrative=narrative.fillna("")
narrative = narrative.apply(lambda x: ' '.join([word for word in x.split() if word not in (eng_stopwords)]))
narrative = narrative.apply(lemmatize_text)

In [8]:
def dummy_fun(doc):
    return doc

tfidf = TfidfVectorizer(max_features=300,analyzer='word', tokenizer=dummy_fun, preprocessor=dummy_fun,token_pattern=None, use_idf=True) 

In [9]:
Narrative_vect=tfidf.fit_transform(narrative)
Issue_vect=tfidf.fit_transform(Issue)

In [10]:
# making single dataframe for all the processed data
Final=pd.concat([pd.DataFrame(Issue_vect.toarray()), pd.DataFrame(Narrative_vect.toarray()),Complete], axis = 1)

In [11]:
print(Narrative_vect.shape)
print(Issue_vect.shape)
print(Complete.shape)
print(Final.shape)

(598027, 300)
(598027, 167)
(598027, 46)
(598027, 513)


# bifurcating the processed data into original data and fitting the model 



In [12]:
# seperating original training and testing sets into cd_train and CD_test
cd_test=Final.iloc[478421:,:]
cd_test=cd_test.drop(['Consumer disputed?'],1)

In [13]:
cd_train=Final.iloc[:478421,:]
x=cd_train.drop(['Consumer disputed?','Complaint ID'],1)
y=cd_train['Consumer disputed?']
print(cd_train.shape)
print(cd_test.shape)

(478421, 513)
(119606, 512)


In [14]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.1, random_state = 18)

# RandomForest

In [15]:
rf_clf = RandomForestClassifier(n_estimators=100, random_state=0)

# fit the model to the training set

rf_clf.fit(x_train, y_train)

# Predict on the test set results

y_pred = rf_clf.predict(x_test)

In [16]:
# Check accuracy score 

auc_score1 = roc_auc_score(y_test, y_pred)

print('Model accuracy score with 100 decision-trees : {0:0.4f}'. format(accuracy_score(y_test, y_pred)))
print('ROC AUC Score is : {0:0.6f}'.format(auc_score1))

Model accuracy score with 100 decision-trees : 0.7704
ROC AUC Score is : 0.511019


In [17]:
# prediction=np.where(rf_clf.predict(cd_test.drop(['Complaint ID'],1))==1,"Yes","No")
# submission=pd.DataFrame(list(zip(cd_test['Complaint ID'],list(prediction))),
#                        columns=['Complaint ID','Consumer disputed?'])
# submission.to_csv(' Submission 2.csv',index=False)

# LogisticRegression

In [18]:
clf=LogisticRegression()

In [19]:
clf.fit(x_train, y_train)

LogisticRegression()

In [20]:
y_pred_LogisticR = clf.predict(x_test)

In [21]:
auc_score2 = roc_auc_score(y_test, y_pred_LogisticR)

print('Model accuracy score with 100 decision-trees : {0:0.4f}'. format(accuracy_score(y_test, y_pred_LogisticR)))
print('ROC AUC Score is : {0:0.6f}'.format(auc_score2))

Model accuracy score with 100 decision-trees : 0.7902
ROC AUC Score is : 0.500000


In [22]:
# prediction=np.where(clf.predict(cd_test.drop(['Complaint ID'],1))==1,"Yes","No")
# submission=pd.DataFrame(list(zip(cd_test['Complaint ID'],list(prediction))),
#                        columns=['Complaint ID','Consumer disputed?'])
# submission.to_csv(' Submission 3.csv',index=False)