In [1]:
import pandas as pd
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import numpy as np

In [2]:
# Load the raw train / test comment tables from the working directory.
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')
# Flatten embedded newlines to spaces. Without regex=True, DataFrame.replace only
# swaps cells whose *entire* value equals '\n', so newlines inside a comment
# would be left untouched. With regex=True the substitution is applied inside
# string cells; non-string cells are not affected.
train=train.replace('\n',' ',regex=True)
test=test.replace('\n',' ',regex=True)

In [3]:
class nlp:
    """Collection of text clean-up, feature-engineering and gensim helper methods.

    Most methods take a pandas Series (``text_column``) and return a new Series.
    Depending on the method, each element is either a raw string (the counting /
    punctuation helpers) or a list of tokens (the stop-word, stemming, n-gram,
    dictionary and word-list helpers).

    Some methods need NLTK data packages to be downloaded beforehand:
    'averaged_perceptron_tagger', 'wordnet', 'stopwords'.
    """
    re = __import__('re')
    pd = __import__('pandas')
    np = __import__('numpy')
    # nltk is kept as a class attribute for backward compatibility, but it is
    # optional here so the pure pandas/regex helpers stay usable without it.
    # Methods that really need nltk import it locally and fail there instead.
    try:
        nltk = __import__('nltk')
    except ImportError:
        nltk = None

    def __init__(self):
        """Print a short version banner."""
        print('nlp V.0.1 \nImported pandas,re,nltk,numpy packages')

    @staticmethod
    def _as_lookup(words):
        """Return a container with fast ``in`` checks for the given word list.

        Strings, sets and dicts are returned unchanged so their membership
        semantics are preserved; other iterables become a frozenset. If the
        elements are not hashable the original object is returned as is.
        """
        if isinstance(words, (str, bytes, set, frozenset, dict)):
            return words
        try:
            return frozenset(words)
        except TypeError:
            return words

    def clean_text_for_topic_modelling(self,text_column):
        """Lower-case each string, split on whitespace and drop tokens of length <= 2.

        Returns a new Series of token lists with a fresh default index.
        """
        lowered = text_column.str.lower()
        tokenised = [[tok for tok in doc.split() if len(tok) > 2] for doc in lowered]
        # dtype=object keeps an empty input from being inferred as a numeric Series.
        return self.pd.Series(tokenised, dtype=object)

    def remove_stopwords_from_corpus(self,text_column,extra_stopwords_list=None):
        """Remove NLTK English stop words (plus optional extras) from each token list.

        Parameters
        ----------
        text_column : Series of token lists.
        extra_stopwords_list : optional iterable of additional words to drop.
        """
        from nltk.corpus import stopwords
        stops = set(stopwords.words('english')).union(extra_stopwords_list or ())
        return text_column.apply(lambda tokens: [w for w in tokens if w not in stops])

    def stemm_corpus(self,text_column,stemmer='porter'):
        """Stem every token of every element with the chosen NLTK stemmer.

        ``stemmer`` may be 'porter' or 'lancaster'; any other value selects the
        English Snowball stemmer. The stemming is applied through ``.apply`` on
        whatever pandas object is passed in.
        """
        if stemmer == 'porter':
            from nltk.stem import PorterStemmer
            stem = PorterStemmer().stem
        elif stemmer == 'lancaster':
            from nltk.stem import LancasterStemmer
            stem = LancasterStemmer().stem
        else:
            from nltk.stem import SnowballStemmer
            stem = SnowballStemmer("english").stem
        return text_column.apply(lambda tokens: [stem(w) for w in tokens])

    def lemmatize_corpus(self,text_column):
        """Lemmatize every token with WordNet, using a per-word POS tag.

        The first letter of the tag returned by ``pos_tag`` is mapped to a
        WordNet part of speech; anything unmapped falls back to noun.
        """
        from nltk import pos_tag
        from nltk.stem import WordNetLemmatizer
        from nltk.corpus import wordnet

        tag_dict = {"J": wordnet.ADJ,
                    "N": wordnet.NOUN,
                    "V": wordnet.VERB,
                    "R": wordnet.ADV}

        def get_word_pos(word):
            # pos_tag expects a list of tokens; handing it the bare string makes
            # it iterate over characters and tag the first letter instead.
            tag = pos_tag([word])[0][1][0].upper()
            return tag_dict.get(tag, wordnet.NOUN)

        wordnet_lemmatizer = WordNetLemmatizer()
        return text_column.apply(
            lambda tokens: [wordnet_lemmatizer.lemmatize(w, pos=get_word_pos(w)) for w in tokens])

    def create_ngrams(self,text_column,replace=False,num_grams=2):
        """Build space-joined n-grams for each token list.

        Parameters
        ----------
        text_column : Series of token lists.
        replace : if True, return only the n-grams (fresh default index);
            otherwise return the original tokens followed by their n-grams,
            keeping the input index and name.
        num_grams : size of the n-grams (honoured in both modes).

        The input Series is not modified; a new Series is returned.
        """
        from nltk import ngrams

        def joined(tokens):
            return [' '.join(gram) for gram in ngrams(tokens, num_grams)]

        if replace:
            return self.pd.Series([joined(doc) for doc in text_column], dtype=object)
        extended = [list(doc) + joined(doc) for doc in text_column]
        return self.pd.Series(extended, index=text_column.index,
                              name=text_column.name, dtype=object)

    def gensim_dic(self,text_column,filter_extremes=True,no_below=3,no_above=0.99):
        """Create a gensim Dictionary from the token lists, optionally filtering extremes."""
        from gensim.corpora import Dictionary
        dictionary = Dictionary(text_column)
        if filter_extremes:
            dictionary.filter_extremes(no_below=no_below, no_above=no_above)
        return dictionary

    def bow_corpus(self,dictionary,text_column):
        """Convert every token list to its bag-of-words form via ``dictionary.doc2bow``."""
        return [dictionary.doc2bow(doc) for doc in text_column]

    def one_hot_encoding_todf(self,dictionary,text_column):
        """Return a documents x vocabulary frame with 1 where a word occurs.

        Columns are the dictionary's tokens in id order; cells for absent words
        are left as NaN. Words that are not in the dictionary are ignored.
        The frame is built from per-document records in one go rather than by
        enlarging it cell by cell.
        """
        from tqdm import tqdm
        columns = [dictionary.get(i) for i in range(len(dictionary))]
        vocabulary = set(columns)
        records = [{word: 1 for word in doc if word in vocabulary}
                   for doc in tqdm(text_column)]
        return self.pd.DataFrame(records, columns=columns)

    def tf_idf_tomatrix(self,dictionary,tf_idf_object):
        """Densify a gensim tf-idf corpus into a DataFrame (one row per document)."""
        from gensim.matutils import corpus2dense
        num_terms = len(dictionary.keys())
        num_docs = dictionary.num_docs
        corpus_tfidf_dense = corpus2dense(tf_idf_object, num_terms, num_docs)
        # corpus2dense's result is transposed so that documents become rows.
        return self.pd.DataFrame(self.np.transpose(corpus_tfidf_dense))

    def vec_to_tfidf(self,bow_corpus):
        """Fit a gensim TfidfModel on the bag-of-words corpus and return the transformed corpus."""
        from gensim import models
        tfidf = models.TfidfModel(bow_corpus)
        return tfidf[bow_corpus]

    def lda_model(self,tfidf_corp,dic,num_topics=20,passes=30,alpha=0.001,eta='auto'):
        """Train a gensim LdaModel on the given corpus with a fixed random_state of 13."""
        from gensim import models
        return models.ldamodel.LdaModel(corpus=tfidf_corp, num_topics=num_topics, id2word=dic,
                                        passes=passes, alpha=alpha, eta=eta, random_state=13)

    def get_coherence(self,lda_model,corpus,dic):
        """Return the 'c_v' coherence score of ``lda_model`` for the given texts."""
        from gensim.models import CoherenceModel
        coherence_model_lda = CoherenceModel(model=lda_model, texts=corpus, dictionary=dic, coherence='c_v')
        return coherence_model_lda.get_coherence()

    def is_tweet_intopic(self,text_column,words_list):
        """Return a boolean Series: does the element contain any word from ``words_list``?

        The result has a fresh default index.
        """
        lookup = self._as_lookup(words_list)
        flags = [any(word in lookup for word in tweet) for tweet in text_column]
        return self.pd.Series(flags)

    def assign_lda_topic(self,lda_model,bow_corpus):
        """Return, per document, the topic number with the highest probability.

        Each item yielded by ``lda_model.get_document_topics`` is treated as a
        sequence of (topic_num, probability) pairs; on ties the first pair wins.
        """
        topics = [
            max(doc_topics, key=lambda pair: pair[1])[0]
            for doc_topics in lda_model.get_document_topics(bow_corpus)
        ]
        return self.pd.Series(topics)

    def get_sentiment(self,text_column):
        """Return TextBlob polarity of each token list joined back into a string."""
        from textblob import TextBlob
        return text_column.apply(lambda tokens: TextBlob(' '.join(tokens)).sentiment.polarity)

    def get_length_of_comment(self,text_column):
        """Return ``len`` of every element."""
        return text_column.apply(len)

    def amount_of_upper(self,text_column):
        """Count the upper-case characters in every element."""
        return text_column.apply(lambda text: sum(1 for ch in text if ch.isupper()))

    def create_uppercase_max_sequence_column(self,text_column):
        """Return the length of the longest run of A-Z characters in each string (0 if none)."""
        upper_runs = self.re.compile(r"[A-Z]+")
        return text_column.apply(
            lambda text: max(map(len, upper_runs.findall(text)), default=0))

    def amount_of_sign(self,text_column,sign):
        """Count the characters of each element that are equal to ``sign``."""
        return text_column.apply(lambda text: sum(1 for ch in text if ch == sign))

    def max_sequence(self,text_column,symbol):
        """Return the length of the longest run of ``symbol`` characters in each string.

        ``symbol`` is escaped before it is placed in the regex character class,
        so characters such as '^', ']' or a backslash are matched literally
        instead of producing an invalid or unintended pattern.
        """
        run_pattern = self.re.compile("[" + self.re.escape(symbol) + "]+")
        return text_column.apply(
            lambda text: max(map(len, run_pattern.findall(text)), default=0))

    def remove_punctuation(self,text_column):
        """Strip every ``string.punctuation`` character from each string."""
        import string
        # Build the translation table once instead of once per row.
        table = str.maketrans('', '', string.punctuation)
        return text_column.apply(lambda text: text.translate(table))

    def bad_comments_column_bin(self,text_column,bad_words_list):
        """Return 1 for elements containing at least one word from ``bad_words_list``, else 0."""
        lookup = self._as_lookup(bad_words_list)
        return text_column.apply(
            lambda comment: 1 if any(word in lookup for word in comment) else 0)

    def bad_comments_column_agg(self,text_column,bad_words_list):
        """Count, per element, how many of its words appear in ``bad_words_list``."""
        lookup = self._as_lookup(bad_words_list)
        return text_column.apply(
            lambda comment: sum(1 for word in comment if word in lookup))

In [4]:
# Keep this cell re-runnable: after the first run the name `nlp` already refers
# to the helper instance (not the class), so only instantiate when it is still a class.
nlp=nlp() if isinstance(nlp,type) else nlp
X=pd.DataFrame()
# Stack train and test comments (train rows first) so both get identical features.
# Series.append was removed in pandas 2.0; pd.concat is the supported equivalent.
train_test_text=pd.concat([train.comment_text,test.comment_text],ignore_index=True)

# Surface features computed on the raw text, before any cleaning.
X['length']=nlp.get_length_of_comment(train_test_text)
X['amount_upper']=nlp.amount_of_upper(train_test_text)
X['max_upper']=nlp.create_uppercase_max_sequence_column(train_test_text)
X['sentiment']=nlp.get_sentiment(train_test_text)
# X['amount_question_marks']=nlp.amount_of_sign(train_test_text,'?')
X['amount_exclamations']=nlp.amount_of_sign(train_test_text,'!')
X['amount_stars']=nlp.amount_of_sign(train_test_text,'*')
X['amount_dashes']=nlp.amount_of_sign(train_test_text,'-')
X['amount_equal_signs']=nlp.amount_of_sign(train_test_text,'=')
# X['max_seq_question_marks']=nlp.max_sequence(train_test_text,'?')
X['max_seq_exclamations']=nlp.max_sequence(train_test_text,'!')
X['max_seq_stars']=nlp.max_sequence(train_test_text,'*')
X['max_seq_dashes']=nlp.max_sequence(train_test_text,'-')

# Text normalisation: punctuation -> tokens -> stop words -> stems -> tokens + bigrams.
train_test_text=nlp.remove_punctuation(train_test_text)
train_test_text=nlp.clean_text_for_topic_modelling(train_test_text)
train_test_text=nlp.remove_stopwords_from_corpus(train_test_text)
# train_test_text=nlp.lemmatize_corpus(train_test_text)
train_test_text=nlp.stemm_corpus(train_test_text,stemmer='snowball')
train_test_text=nlp.create_ngrams(train_test_text,num_grams=2)

# Bad-word features: the word list is stemmed the same way as the comments.
# The whole one-column DataFrame is passed on purpose, so the stemmer iterates
# over the words of the column rather than over the characters of each word.
bad_words=pd.read_csv('bad_words.csv',names=['bad_words'])
bad_words=nlp.stemm_corpus(bad_words,stemmer='snowball')
bad_words=list(bad_words['bad_words'])
X['bad_words_bin']=nlp.bad_comments_column_bin(train_test_text,bad_words)
X['bad_words_agg']=nlp.bad_comments_column_agg(train_test_text,bad_words)

nlp V.0.1 
Imported pandas,re,nltk,numpy packages


In [5]:
# Dictionary -> bag of words -> tf-idf -> LDA; the dominant topic becomes a feature.
dic=nlp.gensim_dic(train_test_text,no_below=11,no_above=0.8)
bow_corp=nlp.bow_corpus(dic,train_test_text)
tfidf=nlp.vec_to_tfidf(bow_corp)
lda_model = nlp.lda_model(tfidf,dic)
X['lda_topic'] = nlp.assign_lda_topic(lda_model,bow_corp)
# The dense tf-idf frame has integer column labels; prefix them so the joined
# feature frame has string-only column names (recent scikit-learn versions
# reject frames whose column names mix str and int).
tfidf_df=nlp.tf_idf_tomatrix(dic,tfidf).add_prefix('tfidf_')
# Standardise the hand-crafted features only; the tf-idf columns are joined afterwards, unscaled.
scaler = StandardScaler()
X=pd.DataFrame(scaler.fit_transform(X),columns=X.columns)
X=X.join(tfidf_df)
# Train rows come first in the stacked text, so split at the actual number of
# training rows rather than at a hard-coded 10000.
n_train=len(train)
X_train=X.iloc[:n_train].copy()
X_test=X.iloc[n_train:].copy()
y_toxic=train['toxic'].copy()
y_obscene=train['obscene'].copy()

In [6]:
# Hold out a quarter of the training rows and score an L1-penalised logistic
# regression on the 'obscene' label with the weighted F1 metric.
(X_train_2,
 X_validate,
 y_train,
 y_validate) = train_test_split(X_train, y_obscene, random_state=22, test_size=0.25)
clf = LogisticRegression(C=0.75, solver='liblinear', penalty='l1').fit(X_train_2, y_train)
f1_score(y_validate, clf.predict(X_validate), average='weighted')

0.9084678344069671

In [None]:
# Same hold-out evaluation on the 'obscene' label with a random forest
# (different split seed from the logistic-regression cell).
(X_train_2,
 X_validate,
 y_train,
 y_validate) = train_test_split(X_train, y_obscene, random_state=4, test_size=0.25)
rf = RandomForestClassifier(n_estimators=1000, max_depth=100, random_state=0).fit(X_train_2, y_train)
f1_score(y_validate, rf.predict(X_validate), average='weighted')

In [7]:
# Refit the classifier on every training row for the 'toxic' label, then
# predict the test rows (fit returns the estimator itself, so the calls chain).
toxic_results = clf.fit(X_train, y_toxic).predict(X_test)



In [12]:
# Refit the same classifier on every training row for the 'obscene' label, then
# predict the test rows (fit returns the estimator itself, so the calls chain).
obscene_results = clf.fit(X_train, y_obscene).predict(X_test)

In [13]:
# Assemble the submission frame: the test table plus one prediction column per
# label, without the raw comment text.
# The axis used to be passed positionally (`drop('comment_text', 1)`), which
# pandas 2.0 no longer accepts; `columns=` is explicit and works on all versions.
try14=(
    test
    .join(pd.DataFrame(toxic_results,columns=['toxic']))
    .join(pd.DataFrame(obscene_results,columns=['obscene']))
    .drop(columns='comment_text')
)

In [14]:
# Write the submission to the working directory, omitting the index column.
try14.to_csv(path_or_buf='try14.csv', index=False)

In [None]:
# One row per feature holding the coefficients of the most recently fitted `clf`.
coefs = pd.DataFrame(clf.coef_.T, index=X.columns)