### Importing the libraries

In [34]:
import pandas as pd
import numpy as np
import re
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler

import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize, sent_tokenize
from nltk.stem.snowball import SnowballStemmer
from wordcloud import WordCloud, STOPWORDS
import string
from textblob import TextBlob

import spacy
from spacy.language import Language
from spacy_language_detection import LanguageDetector

from collections import Counter
import string
# from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.base import BaseEstimator, TransformerMixin


import warnings
warnings.filterwarnings("ignore")
PUNCTUATION = string.punctuation

In [2]:
from spacy.lang.en.stop_words import STOP_WORDS as stopwords
spacy.load('en_core_web_sm')

<spacy.lang.en.English at 0x1f515c0da90>

### Reading the csv file

In [3]:
df = pd.read_csv("twitter.csv")
df.head()

Unnamed: 0,status_id,account.type,tweets
0,1208265880146046976,bot,YEA now that note GOOD
1,1091463908118941696,human,Listen to This Charming Man by The Smiths htt...
2,1199055191028293633,bot,たぶんあの時からわたしは……そなたが
3,1214698264701722626,bot,The decade in the significantly easier schedul...
4,1209229478934695937,bot,"""Theim class=\""alignnone size-full wp-image-60..."


In [4]:
df.shape

(10000, 3)

In [5]:
df.columns

Index(['status_id', 'account.type', 'tweets'], dtype='object')

In [6]:
df.dtypes

status_id       object
account.type    object
tweets          object
dtype: object

#### Checking for null values

In [7]:
df.isnull().sum()

status_id       0
account.type    0
tweets          4
dtype: int64

In [8]:
df.dropna(inplace=True)

In [9]:
df.isnull().sum().sum()

0

In [10]:
df = df.rename(columns = {"account.type":"account_type"})

In [11]:
data = df.copy()

In [12]:
data.account_type.value_counts()

account_type
human    5002
bot      4994
Name: count, dtype: int64

#### text normalization

In [13]:
data["tweets"] = data["tweets"].str.lower()

In [14]:
data.head()

Unnamed: 0,status_id,account_type,tweets
0,1208265880146046976,bot,yea now that note good
1,1091463908118941696,human,listen to this charming man by the smiths htt...
2,1199055191028293633,bot,たぶんあの時からわたしは……そなたが
3,1214698264701722626,bot,the decade in the significantly easier schedul...
4,1209229478934695937,bot,"""theim class=\""alignnone size-full wp-image-60..."


In [15]:
def simplify(q):
    contractions = { 
    "ain't": "am not",
    "aren't": "are not",
    "can't": "can not",
    "can't've": "can not have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
    }
    q_decontracted = []

    for word in q.split():
        if word in contractions:
            word = contractions[word]

        q_decontracted.append(word)

    q = ' '.join(q_decontracted)
    return q

In [16]:
data["tweets"] = data["tweets"].apply(lambda i: simplify(i))

#### Detecting non english languages

In [17]:
def get_lang_detector(nlp, name):
    return LanguageDetector(seed=42)  # We use the seed 42

nlp_model = spacy.load("en_core_web_sm")
Language.factory("language_detector", func=get_lang_detector)
nlp_model.add_pipe('language_detector', last=True)

def Language_Detection(txt):
    doc = nlp_model(txt)
    return (doc._.language['language'])

In [18]:
data['lan'] = data['tweets'].apply(lambda x: Language_Detection(x))

In [19]:
inx = data[data['lan']!='en'].index
data.drop(inx, inplace = True)

In [20]:
data['account_type'] = data['account_type'].map({'human': 1, 'bot': 0})

In [21]:
train_df, test_df = train_test_split(data, test_size=0.1)

In [22]:
train_data = train_df['tweets']
train_target = train_df['account_type']

test_data = test_df['tweets']
test_target = test_df['account_type']

#### Tf-Idf pipeline

In [23]:
bow_pipeline = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),
        ("classifier", RandomForestClassifier()),
    ]
)
bow_pipeline.fit(train_data, train_target)
y_pred = bow_pipeline.predict(test_data)
cr = classification_report(test_target, y_pred)
print(cr)

              precision    recall  f1-score   support

           0       0.65      0.77      0.71       353
           1       0.71      0.58      0.64       345

    accuracy                           0.68       698
   macro avg       0.68      0.68      0.67       698
weighted avg       0.68      0.68      0.67       698



#### Word embeddings pipeline

In [24]:
import spacy 
nlp = spacy.load("en_core_web_sm")  # this model will give you 300D
class SpacyVectorTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, nlp):
        self.nlp = nlp
        self.dim = 300

    def fit(self, X, y):
        return self

    def transform(self, X):
        # Doc.vector defaults to an average of the token vectors.
        # https://spacy.io/api/doc#vector
        return [self.nlp(text).vector for text in X]


In [25]:
embeddings_pipeline = Pipeline(
    steps=[
        ("mean_embeddings", SpacyVectorTransformer(nlp)),
        ("classifier", RandomForestClassifier()),
    ]
)
embeddings_pipeline.fit(train_data, train_target)
y_pred = embeddings_pipeline.predict(test_data)
cr = classification_report(test_target, y_pred)
cr

'              precision    recall  f1-score   support\n\n           0       0.67      0.70      0.68       353\n           1       0.68      0.65      0.66       345\n\n    accuracy                           0.67       698\n   macro avg       0.67      0.67      0.67       698\nweighted avg       0.67      0.67      0.67       698\n'

#### Custom Pipeline

In [26]:
class SegmentFeaturizer:
    def __init__(self):
        self.nlp = spacy.load("en_core_web_sm")
        self.future_words = ["tomorrow", "future", "futures"]
        
    @staticmethod
    def getCharCount(doc):
#         print(len(doc.text))
        return len(doc.text)
    
    @staticmethod
    def getWordsInSen(doc):
        res = sum([i.strip(string.punctuation).isalpha() for i in doc.text.split()])
        return res
    
    @staticmethod
    def noSen(doc):
        return len(nltk.sent_tokenize(doc.text))
    
    @staticmethod
    def noUniqueWord(doc):
        return len(set(doc.text.split()))
    
    @staticmethod
    def avgWordLenInSen(doc):
        words = [word for word in doc.text.split() if word]
        if (len(words) != 0):
            avg = sum(map(len, words))/len(words)
            return avg
        else:
            return 0
        
    @staticmethod
    def avgSenLen(doc):
        if (SegmentFeaturizer.noSen(doc) != 0):
            return (SegmentFeaturizer.getWordsInSen(doc) / SegmentFeaturizer.noSen(doc))
        else:
            return 0

    @staticmethod
    def count_pronouns(doc):
        segment = doc.text.lower().split()
        counter = {"1sg": 0, "1pl": 0}
        for pronoun in FIRST_SINGULAR:
            counter["1sg"] += segment.count(pronoun)
        for pronoun in FIRST_PLURAL:
            counter["1pl"] += segment.count(pronoun)
        return counter

    @staticmethod
    def getWordInDoubleQuote(doc):
        return len(re.findall(r'["][\w\s]+["]', doc.text))
    
    @staticmethod
    def getNoStopWords(doc):
        return len([x for x in nltk.word_tokenize(doc.text) if x in STOPWORDS])


    @staticmethod
    def getPunctuationcount(doc):
        return len("".join(x for x in doc.text if x in PUNCTUATION))
    
    @staticmethod
    def getNoPositiveWords(doc):
        return len([i for i in doc.text.split() if TextBlob(i).sentiment.polarity >= 0.5])
    
    @staticmethod
    def getNoNegativeWords(doc):
        return len([i for i in doc.text.split() if TextBlob(i).sentiment.polarity >= 0.5])

    @staticmethod
    def get_n_words_before_main_verb(doc):
        numbers = [0]
        for sent in doc.sents:
            main = [t for t in sent if t.dep_ == "ROOT"][0]
            if main.pos_ == "VERB":
                dist_to_init = main.i - sent[0].i
                numbers.append(dist_to_init)
        return np.mean(numbers)

    @staticmethod
    def get_n_complex_clauses(doc):
        embedded_elements_count = []
        for sent in doc.sents:
            n_embedded = len(
                [t for t in sent if t.dep_ in {"ccomp", "xcomp", "advcl", "dative"}]
            )
            embedded_elements_count.append(n_embedded)
        return np.mean(embedded_elements_count)
    
    # putting it all together!
    def featurize(self, segments):
        feature_dicts = []
        docs = self.nlp.pipe(segments)
        for doc in docs:
            feature_dict = {
                'charCount': self.getCharCount(doc),
                'getWordsInSen': self.getWordsInSen(doc),
                'noSen': self.noSen(doc),
                'noUniqueWord': self.noUniqueWord(doc),
                'avgWordLenInSen': self.avgWordLenInSen(doc),
                'avgSenLen': self.avgSenLen(doc),
                "n_complex_clauses": self.get_n_complex_clauses(doc),
                "n_words_before_main_verb": self.get_n_words_before_main_verb(doc),
                'getNoNegativeWords': self.getNoNegativeWords(doc),
                'getNoPositiveWords': self.getNoPositiveWords(doc),
                'getPunctuationcount': self.getPunctuationcount(doc),
                'getNoStopWords': self.getNoStopWords(doc),
                'getWordInDoubleQuote': self.getWordInDoubleQuote(doc),
                
            }
            feature_dicts.append(feature_dict)
        return feature_dicts


In [27]:
segment_featurizer = SegmentFeaturizer()  # more on this below
class CustomLinguisticFeatureTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.l = 1
    def fit(self, x, y=None):
        return self
    def transform(self, data):
        return segment_featurizer.featurize(data)


In [28]:
manual_pipeline = Pipeline(
    steps=[
        ("stats", CustomLinguisticFeatureTransformer()),
        ("dict_vect", DictVectorizer()),
        ("classifier", RandomForestClassifier()),
    ]
)
manual_pipeline.fit(train_data, train_target)
y_pred = manual_pipeline.predict(test_data)
cr = classification_report(test_target, y_pred)
cr

'              precision    recall  f1-score   support\n\n           0       0.68      0.72      0.70       353\n           1       0.70      0.66      0.68       345\n\n    accuracy                           0.69       698\n   macro avg       0.69      0.69      0.69       698\nweighted avg       0.69      0.69      0.69       698\n'

#### Combined pipeline - tfidf pipeline + custom pipeline

In [29]:
bow_pipeline = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),
    ]
)
manual_pipeline = Pipeline(
    steps=[
        ("stats", CustomLinguisticFeatureTransformer()),
        ("dict_vect", DictVectorizer()),
    ]
)

In [30]:
from sklearn.pipeline import FeatureUnion

combined_features = FeatureUnion(
    transformer_list=[
        ("manual", manual_pipeline),
        ("bow", bow_pipeline),
    ]
)
final_pipeline = Pipeline(
    steps=[
        ("combined_features", combined_features),
        ("classifier", RandomForestClassifier()),
    ]
)

In [31]:
final_pipeline.fit(train_data, train_target)
y_pred = final_pipeline.predict(test_data)
cr = classification_report(test_target, y_pred)

'              precision    recall  f1-score   support\n\n           0       0.70      0.79      0.74       353\n           1       0.76      0.66      0.70       345\n\n    accuracy                           0.72       698\n   macro avg       0.73      0.72      0.72       698\nweighted avg       0.73      0.72      0.72       698\n'

In [35]:
accuracy_score(test_target, y_pred)

0.7249283667621776

In [32]:
print(cr)

              precision    recall  f1-score   support

           0       0.70      0.79      0.74       353
           1       0.76      0.66      0.70       345

    accuracy                           0.72       698
   macro avg       0.73      0.72      0.72       698
weighted avg       0.73      0.72      0.72       698



In [33]:
import pickle
pickle.dump(final_pipeline, open('custom1_pipe_model.pkl', 'wb'))