In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Data cleaning:
import re
from collections import Counter 
import nltk
from nltk import word_tokenize, FreqDist
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
from nltk.tokenize import TweetTokenizer
import string
#from spellchecker import SpellChecker
# Modeling basics:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
# For D2V + RF:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import multiprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn import utils
# For Glove + LSTM:
from tqdm import tqdm

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
df = pd.read_csv('../input/swisstext-german/data_class.csv', encoding='utf-8')
df = df.drop(['Unnamed: 0.1'], axis=1)
df

Unnamed: 0.1,Unnamed: 0,source1,no_words_inSent_SS,no_words_inSent_SK,noCap_LetterWords_inSentence,sim_sent
0,0,Minghella Sohn italienisch-schottischer Eltern...,1,0,7,1
1,1,Nach Schulabschluss studierte Universität Hul...,0,0,6,0
2,2,1978 drehte ersten Kurzfilm .,0,0,1,0
3,3,Seit 1981 Autor Story Editor tätig .,0,0,4,0
4,4,"Er wurde Theaterstücken , Rundfunkhörspielen...",0,0,9,0
...,...,...,...,...,...,...
3513930,3513930,Carl Sigmans Text erschien 1960er Jahren briti...,0,0,7,1
3513931,3513931,Camillo Felgen alias Heinz Helmer verfasste de...,5,1,12,0
3513932,3513932,Zahlreiche Interpreten sangen 1964 Lied franzo...,1,1,11,1
3513933,3513933,Von Letzterer stammt spanischsprachige Fassung...,2,1,6,0


In [3]:
df['sim_sent'] = np.where((df.noCap_LetterWords_inSentence>8.0) | (df.no_words_inSent_SS>=2.0), 'Yes', 'No')

In [4]:
size = np.random.rand(len(df)) < 0.6
df2=df[size]

In [5]:
train, test = train_test_split(df2, test_size=0.3, random_state=42)

In [6]:
nltk.download('punkt')
def tokenize_text(text):
    tokens = []
    for word in nltk.word_tokenize(text, language = "german"):
        if len(word) < 2:
            continue
        tokens.append(word.lower())
    return tokens

train_tagged = train.apply(
    lambda r: TaggedDocument(words=tokenize_text(r['source1']), tags=[r.sim_sent]), axis=1)
test_tagged = test.apply(
    lambda r: TaggedDocument(words=tokenize_text(r['source1']), tags=[r.sim_sent]), axis=1)

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
import multiprocessing

cores = multiprocessing.cpu_count()

In [8]:
model_dbow = Doc2Vec(dm=0, vector_size=200, negative=5, hs=0, min_count=2, sample = 0, workers=cores)
model_dbow.build_vocab([x for x in tqdm(train_tagged.values)])

100%|██████████| 1476085/1476085 [00:00<00:00, 2264532.92it/s]


In [9]:
for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

100%|██████████| 1476085/1476085 [00:00<00:00, 2255029.34it/s]
100%|██████████| 1476085/1476085 [00:00<00:00, 2389722.33it/s]
100%|██████████| 1476085/1476085 [00:00<00:00, 2370672.69it/s]
100%|██████████| 1476085/1476085 [00:00<00:00, 2405720.91it/s]
100%|██████████| 1476085/1476085 [00:00<00:00, 2345841.36it/s]
100%|██████████| 1476085/1476085 [00:00<00:00, 2300735.07it/s]
100%|██████████| 1476085/1476085 [00:00<00:00, 2346819.50it/s]
100%|██████████| 1476085/1476085 [00:00<00:00, 2375353.25it/s]
100%|██████████| 1476085/1476085 [00:00<00:00, 2379189.84it/s]
100%|██████████| 1476085/1476085 [00:00<00:00, 2344052.60it/s]
100%|██████████| 1476085/1476085 [00:00<00:00, 2342677.79it/s]
100%|██████████| 1476085/1476085 [00:00<00:00, 2325155.32it/s]
100%|██████████| 1476085/1476085 [00:00<00:00, 2332355.57it/s]
100%|██████████| 1476085/1476085 [00:00<00:00, 2379589.46it/s]
100%|██████████| 1476085/1476085 [00:00<00:00, 2377451.21it/s]
100%|██████████| 1476085/1476085 [00:00<00:00, 2357090.

In [10]:
def vec_for_learning(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, regressors
def vec_for_learning(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, regressors

In [11]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
y_train, X_train = vec_for_learning(model_dbow, train_tagged)
y_test, X_test = vec_for_learning(model_dbow, test_tagged)
from sklearn.linear_model import LogisticRegression
lr_classifier = LogisticRegression(n_jobs=1, C=1e5,random_state=0)
lr_classifier.fit(X_train, y_train)
y_pred = lr_classifier.predict(X_test)
print("\n============ Logistic regression")
print("---- confusion_matrix:")
print(confusion_matrix(y_test,y_pred))
print("---- classification_report:")
print(classification_report(y_test,y_pred))
print("---- accuracy_score:")
print(accuracy_score(y_test,y_pred))


---- confusion_matrix:
[[326295  47923]
 [ 99598 158792]]
---- classification_report:
              precision    recall  f1-score   support

          No       0.77      0.87      0.82    374218
         Yes       0.77      0.61      0.68    258390

    accuracy                           0.77    632608
   macro avg       0.77      0.74      0.75    632608
weighted avg       0.77      0.77      0.76    632608

---- accuracy_score:
0.7668050356618948


In [12]:
from sklearn.naive_bayes import GaussianNB
print("n============ Fitting Naive Bayes...")
gnb_classifier = GaussianNB()
gnb_classifier.fit(X_train, y_train)
y_pred = gnb_classifier.predict(X_test)
print("\n============ Naive Bayes")
print("---- confusion_matrix:")
print(confusion_matrix(y_test,y_pred))
print("---- classification_report:")
print(classification_report(y_test,y_pred))
print("---- accuracy_score:")
print(accuracy_score(y_test,y_pred))


---- confusion_matrix:
[[338805  35413]
 [115801 142589]]
---- classification_report:
              precision    recall  f1-score   support

          No       0.75      0.91      0.82    374218
         Yes       0.80      0.55      0.65    258390

    accuracy                           0.76    632608
   macro avg       0.77      0.73      0.74    632608
weighted avg       0.77      0.76      0.75    632608

---- accuracy_score:
0.7609672972836259


In [13]:
from sklearn.ensemble import RandomForestClassifier
print("n============ Fitting Random Forest...")
forest_classifier = RandomForestClassifier()
forest_classifier.fit(X_train, y_train)

y_pred = forest_classifier.predict(X_test)
print("\n============ Random Forest")
print("---- confusion_matrix:")
print(confusion_matrix(y_test,y_pred))
print("---- classification_report:")
print(classification_report(y_test,y_pred))
print("---- accuracy_score:")
print(accuracy_score(y_test,y_pred))


---- confusion_matrix:
[[324182  50036]
 [ 98577 159813]]
---- classification_report:
              precision    recall  f1-score   support

          No       0.77      0.87      0.81    374218
         Yes       0.76      0.62      0.68    258390

    accuracy                           0.77    632608
   macro avg       0.76      0.74      0.75    632608
weighted avg       0.76      0.77      0.76    632608

---- accuracy_score:
0.7650788481966716


In [14]:
from sklearn import svm
svm_classifier = svm.LinearSVC()
print("n============ Support Vector Machine...")
svm_classifier.fit(X_train, y_train)

y_pred = svm_classifier.predict(X_test)
print("\n============ Support Vector Machine")
print("---- confusion_matrix:")
print(confusion_matrix(y_test,y_pred))
print("---- classification_report:")
print(classification_report(y_test,y_pred))
print("---- accuracy_score:")
print(accuracy_score(y_test,y_pred))


---- confusion_matrix:
[[329293  44925]
 [103052 155338]]
---- classification_report:
              precision    recall  f1-score   support

          No       0.76      0.88      0.82    374218
         Yes       0.78      0.60      0.68    258390

    accuracy                           0.77    632608
   macro avg       0.77      0.74      0.75    632608
weighted avg       0.77      0.77      0.76    632608

---- accuracy_score:
0.7660842101269665


In [15]:
from sklearn.linear_model import LogisticRegression
lr_classifier = LogisticRegression(n_jobs=1, C=1e5,random_state=0)
lr_classifier.fit(X_train, y_train)
y_pred = lr_classifier.predict(X_test)
print("\n============ Logistic regression")
print("---- confusion_matrix:")
print(confusion_matrix(y_test,y_pred))
print("---- classification_report:")
print(classification_report(y_test,y_pred))
print("---- accuracy_score:")
print(accuracy_score(y_test,y_pred))


---- confusion_matrix:
[[326295  47923]
 [ 99598 158792]]
---- classification_report:
              precision    recall  f1-score   support

          No       0.77      0.87      0.82    374218
         Yes       0.77      0.61      0.68    258390

    accuracy                           0.77    632608
   macro avg       0.77      0.74      0.75    632608
weighted avg       0.77      0.77      0.76    632608

---- accuracy_score:
0.7668050356618948
