In [1]:
import pandas as pd
import re

from nltk import PorterStemmer, SnowballStemmer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

col = ['target', 'id', 'date', 'flag', 'user', 'text']

df = pd.read_csv(r'training2.1600000.processed.noemoticon.csv', header = None, names = col,  encoding='latin-1')

df.fillna(value='', inplace=True)


# df = df[['text','target']]
df = df.sample(n=10000)

# Handle Categories variable
df['target'] = df['target'].replace(4,1)

ps = PorterStemmer()
sb = SnowballStemmer(language='english')


corpus = []
corpus1 = []

data = []


In [2]:
def get_part_of_day(h):
        if   5 <= h <= 11:
            return 'morning'
        elif 12 <= h <= 17:
            return 'afternoon'
        elif 18 <= h <= 22:
            return 'evening'
        else:
            return 'night'


day = []
month = []
dayInMonth = []
timeOfTweet = []
timePeriod = []
year = []

for d in df['date']:
    sub = d.split(' ')
    day.append(sub[0])
    dayInMonth.append(sub[2])
    timePeriod.append(get_part_of_day(int(sub[3][:2])))

    # time zones is only pdt
    # timeZone.append(sub[4])

    # year is only 2009
    # year.append(sub[5])



data = pd.DataFrame()

data['day'] = day
data['dayInMonth'] = dayInMonth
data['partOfDay'] = timePeriod

data = pd.get_dummies(data, columns=['day','partOfDay'])

data['target'] = df['target'].values
data['text'] = df['text'].values

df = data

In [3]:
def bert_encode(texts, tokenizer, max_len=512):
    all_tokens = []
    all_masks = []
    all_segments = []
    
    for text in texts:
        text = tokenizer.tokenize(text)
            
        text = text[:max_len-2]
        input_sequence = ["[CLS]"] + text + ["[SEP]"]
        pad_len = max_len - len(input_sequence)
        
        tokens = tokenizer.convert_tokens_to_ids(input_sequence) + [0] * pad_len
        pad_masks = [1] * len(input_sequence) + [0] * pad_len
        segment_ids = [0] * max_len
        
        all_tokens.append(tokens)
        all_masks.append(pad_masks)
        all_segments.append(segment_ids)
    
    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)


In [4]:
def build_model(bert_layer, max_len=30):
    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")
    pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    
    clf_output = sequence_output[:, 0, :]
    net = tf.keras.layers.Dense(16, activation='relu')(clf_output)
    net = tf.keras.layers.Dropout(0.4)(net)
    net = tf.keras.layers.Dropout(0.4)(net)
    out = tf.keras.layers.Dense(1, activation='sigmoid')(clf_output)
    
    model = tf.keras.models.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    model.compile(optimizer = 'adam', loss='binary_crossentropy', metrics=['accuracy'])
   
    return model


In [5]:
def plot_graph(history,string):
    
    plt.plot(history.history[string],label='training '+string)
    plt.plot(history.history['val_'+string],label='validation '+string)
    plt.legend()
    plt.xlabel('epochs')
    plt.ylabel(string)
    plt.title(string+' vs epochs')
    plt.show()


In [6]:
print("starting text clean")



from nltk.stem import WordNetLemmatizer 

# nltk.download('wordnet')
# nltk.download('omw-1.4')

# Init the Wordnet Lemmatizer
lemmatizer = WordNetLemmatizer()



ps = PorterStemmer()
sb = SnowballStemmer(language='english')

import nltk
nltk.download('stopwords')
nltk.download('wordnet')


all_stopwords = stopwords.words('english')

corpus = []
corpus1 = []



for sen in df['text']:
    # remove hashtags
    sen = re.sub("(@[A-Za-z0-9_]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", ' ', sen)
#     sen = re.sub('(@|#|&)+([a-zA-Z0-9_]+)', '', sen)
    # remove weird chars
    sen = re.sub('[^a-zA-z\'\"]+', ' ', sen)
    # remove urls
    sen = re.sub(r'\$\w*', '', sen)
    # remove old style retweet text "RT"
    sen = re.sub(r'^RT[\s]+', '', sen)
    # remove hyperlinks
    sen = re.sub(r'https?:\/\/.*[\r\n]*', '', sen)
    # only removing the hash # sign from the word
    sen = re.sub(r'#', '', sen)

    

    sen = sen.lower()
    sen = sen.split()

    


    
            # porter stemmer vs snowball stemmer
#     text = [ps.stem(word) for word in sen if not word in set(all_stopwords)]

        # lemmtization vs stemming    (word meaning vs stem)
#   text1 = [sb.stem(word) for word in sen if not word in set(all_stopwords)]
    text1 = [lemmatizer.lemmatize(word) for word in sen if not word in set(all_stopwords)]
    
    
#     text = ' '.join(text)
    text1 = ' '.join(text1)
    
    
#     corpus.append(text)
    corpus1.append(text1)

starting text clean
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
vector = vectorizer.fit_transform(corpus1)
print(type(vector))
words = pd.DataFrame.sparse.from_spmatrix(vector)
print(type(words))
df = pd.concat([df, words], axis=1, join='inner')


df = df.drop('text',axis=1)

df[df.columns[15:]] = df.iloc[:, 15:].astype('float16')
y = df.target
X = df.drop('target', axis=1)
X1 = pd.DataFrame.sparse.from_spmatrix(vector)


<class 'scipy.sparse.csr.csr_matrix'>
<class 'pandas.core.frame.DataFrame'>


In [8]:
print("starting splitting data")

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size = 0.3, random_state = 42)
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y,test_size = 0.3, random_state = 42)


starting splitting data


In [9]:
X.head()

Unnamed: 0,dayInMonth,day_Fri,day_Mon,day_Sat,day_Sun,day_Thu,day_Tue,day_Wed,partOfDay_afternoon,partOfDay_evening,...,11837,11838,11839,11840,11841,11842,11843,11844,11845,11846
0,2,0,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,30,0,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,20,0,0,1,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,0,0,0,0,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,22,1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
X1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11837,11838,11839,11840,11841,11842,11843,11844,11845,11846
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
!git clone -b master https://github.com/charles9n/bert-sklearn
%cd bert-sklearn
!pip install .

fatal: destination path 'bert-sklearn' already exists and is not an empty directory.
/content/bert-sklearn
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Processing /content/bert-sklearn
[33m  DEPRECATION: A future pip version will change local packages to be built in-place without first copying to a temporary directory. We recommend you use --use-feature=in-tree-build to test your packages with this new behavior before it becomes the default.
   pip 21.3 will remove support for this functionality. You can find discussion regarding this at https://github.com/pypa/pip/issues/7555.[0m
Building wheels for collected packages: bert-sklearn
  Building wheel for bert-sklearn (setup.py) ... [?25l[?25hdone
  Created wheel for bert-sklearn: filename=bert_sklearn-0.3.1-py3-none-any.whl size=54247 sha256=3e9c8e1c28f166896f8fb9a55e583e2c78df92a526ba26365fd3b1f43e931226
  Stored in directory: /root/.cache/pip/wheels/e7/d4/73/12b2219a5cd4cd8c7ac

In [12]:
%cd ..
%pwd

/content


'/content'

In [13]:
from bert_sklearn import BertClassifier
model = BertClassifier()  

Building sklearn text classifier...


In [14]:
model.fit(X_train, y_train)

# make predictions
y_pred = model.predict(X_test)


# score model on test data
print(model.score(X_test, y_test))

Loading bert-base-uncased model...
Defaulting to linear classifier/regressor
Loading Pytorch checkpoint

train data size: 6300, validation data size: 700



  cpuset_checked))


Training  :   0%|          | 0/197 [00:00<?, ?it/s]

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  ../torch/csrc/utils/python_arg_parser.cpp:1055.)
  next_m.mul_(beta1).add_(1 - beta1, grad)


Validating:   0%|          | 0/88 [00:00<?, ?it/s]


Epoch 1, Train loss: 0.4999, Val loss: 0.4219, Val accy: 79.71%



Training  :   0%|          | 0/197 [00:00<?, ?it/s]

Validating:   0%|          | 0/88 [00:00<?, ?it/s]


Epoch 2, Train loss: 0.4308, Val loss: 0.4159, Val accy: 79.57%



Training  :   0%|          | 0/197 [00:00<?, ?it/s]

Validating:   0%|          | 0/88 [00:00<?, ?it/s]


Epoch 3, Train loss: 0.4181, Val loss: 0.3992, Val accy: 80.57%



Predicting:   0%|          | 0/375 [00:00<?, ?it/s]

Testing:   0%|          | 0/375 [00:00<?, ?it/s]


Loss: 0.4154, Accuracy: 78.60%
78.60000000000001


In [15]:
model.fit(X_train1, y_train1)

# make predictions
y_pred = model.predict(X_test1)


# score model on test data
print(model.score(X_test1, y_test1))

Loading bert-base-uncased model...
Defaulting to linear classifier/regressor
Loading Pytorch checkpoint

train data size: 6300, validation data size: 700



  cpuset_checked))


Training  :   0%|          | 0/197 [00:00<?, ?it/s]

Validating:   0%|          | 0/88 [00:00<?, ?it/s]


Epoch 1, Train loss: 0.5668, Val loss: 0.5531, Val accy: 76.00%



Training  :   0%|          | 0/197 [00:00<?, ?it/s]

Validating:   0%|          | 0/88 [00:00<?, ?it/s]


Epoch 2, Train loss: 0.5608, Val loss: 0.5549, Val accy: 76.00%



Training  :   0%|          | 0/197 [00:00<?, ?it/s]

Validating:   0%|          | 0/88 [00:00<?, ?it/s]


Epoch 3, Train loss: 0.5608, Val loss: 0.5512, Val accy: 76.00%



Predicting:   0%|          | 0/375 [00:00<?, ?it/s]

Testing:   0%|          | 0/375 [00:00<?, ?it/s]


Loss: 0.5557, Accuracy: 75.60%
75.6
