# Imports

In [None]:
!pip install -i https://pypi.clarin-pl.eu lpmn_client

Looking in indexes: https://pypi.clarin-pl.eu
Collecting lpmn_client
  Downloading http://pypi.clarin-pl.eu/packages/lpmn_client-1.3.5.9-py3-none-any.whl
Installing collected packages: lpmn-client
Successfully installed lpmn-client-1.3.5.9


In [None]:
 from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import seaborn as sns
from scipy import stats
import re, string
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from keras.layers import Dense, Embedding, LSTM
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from lpmn_client.src.requester import Requester
import zipfile
from tensorflow.keras.utils import to_categorical
import xml.etree.ElementTree as ET
import numbers
import decimal
import tensorflow as tf

Mounted at /content/drive


# Data loading

In [None]:
# Locations of the raw citations file and of the derived artefacts on the mounted Drive.
data_path = '/content/drive/MyDrive/ZIwG P/citations.csv'
tfidf_features_path = '/content/drive/MyDrive/ZIwG P/tfidf_features.csv'
content_root = '/content/drive/MyDrive/ZIwG P'

pd.set_option('display.max_rows', 10)
pd.set_option('display.max_colwidth', 120)

# Load the statements, drop incomplete rows and renumber the remaining ones.
df = pd.read_csv(data_path)
df = df.dropna()
df = df.reset_index(drop=True)

classes_names = {0: 'Fałsz', 1: 'Prawda', 2: "Manipulacja", 3: "Nieweryfikowalne"}
# Assign the encoded labels back to the column instead of calling
# replace(..., inplace=True) on the column selection: the in-place form operates on a
# temporary object and does not update df when pandas Copy-on-Write is active.
df['label'] = df['label'].replace({'Fałsz': 0, 'Prawda': 1, 'Manipulacja': 2, 'Nieweryfikowalne': 3})
# Keep only the two-class problem (0 and 1). .copy() makes the result an independent
# frame, so the feature columns assigned in later cells do not write into a slice.
df = df[(df['label'] == 0) | (df['label'] == 1)].copy()
df

Unnamed: 0,content,author,label
0,O ile my podnieśliśmy pensję minimalną w czasie 8 lat? (…) o ponad 90 proc.,Izabela Leszczyna,0
1,W rankingu praworządności World Justice Project w 2020 r. Polska znów spadła na niższą pozycję.,Hanna Gill-Piątek,1
2,"Na Uniwersytecie Warszawskim powstał taki raport, który jest oczywiście państwu znany, z którego wyraźnie wynika, że...",Dariusz Rosati,1
3,Średnia emerytura w Polsce kobiet i mężczyzn różni się aż o 1000 zł.,Małgorzata Kidawa-Błońska,1
4,"Proszę mi pokazać (…) jedną osobę skazaną przez Trybunał Stanu od czasu, kiedy Trybunał Stanu w Polsce, wolnej, tak ...",Paweł Kukiz,0
...,...,...,...
4330,"Chroniliście SKOK-i, chroniliście swoich kolesi. Dlatego z budżetu państwa ponad 4,5 mld zł trzeba było im wypłacić.",Borys Budka,1
4332,"Jeżeli chodzi o penalizację homoseksualizmu, to w Polsce nigdy czegoś takiego nie było. Ani za I Rzeczypospolitej, a...",Kosma Złotowski,0
4333,Rządowy Fundusz Inwestycji Lokalnych na Pomorzu – największy beneficjent gmina miasta Gdańsk.,Marcin Horała,1
4334,"Zarówno WHO, EMA, czyli Europejska Agencja Leków, jak i polskie instytucje, w tym Urząd Rejestracji Produktów Leczni...",Michał Dworczyk,1


# Feature creation

In [None]:
def count_uppercase_letters(text):
    """Return the percentage of non-space characters in ``text`` that are uppercase.

    Returns 0.0 when ``text`` contains no non-space characters (empty or spaces
    only) instead of raising ZeroDivisionError.
    """
    non_space_length = len(text) - text.count(' ')
    if non_space_length == 0:
        return 0.0
    uppercase_count = sum(1 for char in text if char.isupper())
    return (uppercase_count / non_space_length) * 100


def count_exclamation_marks(text):
    """Return the percentage of non-space characters in ``text`` that are ``'!'``.

    Returns 0.0 when ``text`` contains no non-space characters (empty or spaces
    only) instead of raising ZeroDivisionError.
    """
    non_space_length = len(text) - text.count(' ')
    if non_space_length == 0:
        return 0.0
    return (text.count('!') / non_space_length) * 100


def count_question_marks(text):
    """Return the percentage of non-space characters in ``text`` that are ``'?'``.

    Returns 0.0 when ``text`` contains no non-space characters (empty or spaces
    only) instead of raising ZeroDivisionError.
    """
    non_space_length = len(text) - text.count(' ')
    if non_space_length == 0:
        return 0.0
    return (text.count('?') / non_space_length) * 100


def count_quotation_marks(text):
    """Return the percentage of non-space characters in ``text`` that are ``'"'``.

    Only the ASCII double quote is counted. Returns 0.0 when ``text`` contains no
    non-space characters (empty or spaces only) instead of raising
    ZeroDivisionError.
    """
    non_space_length = len(text) - text.count(' ')
    if non_space_length == 0:
        return 0.0
    return (text.count('"') / non_space_length) * 100


def count_punctuation(text):
    """Return the percentage of non-space characters in ``text`` that are punctuation.

    A character counts as punctuation when it is listed in ``string.punctuation``
    (ASCII punctuation only). Returns 0.0 when ``text`` contains no non-space
    characters (empty or spaces only) instead of raising ZeroDivisionError.
    """
    non_space_length = len(text) - text.count(' ')
    if non_space_length == 0:
        return 0.0
    punctuation_count = sum(1 for char in text if char in string.punctuation)
    return (punctuation_count / non_space_length) * 100


def count_text_length(text):
    """Return how many characters of ``text`` are not the space character ``' '``."""
    return sum(1 for character in text if character != ' ')


def get_sentiment(text):
    """Return the summed polarity reported by the LPMN service for ``text``.

    The text is uploaded, processed with the emotion-statistics query below and the
    response is downloaded to ``./sentiment.zip``. The first member of that archive
    is read as a ``;``-separated CSV and the numeric values of its ``Polarity``
    column are summed (each value truncated to an integer first).

    Side effect: ``sentiment.zip`` is left in the working directory;
    ``count_positive_words`` and ``count_negative_words`` read that same file.

    Returns 0 (after printing the error) when the archive cannot be read or parsed.
    """
    # NOTE(review): the account e-mail is hard-coded here — consider moving it to configuration.
    requester = Requester('241393@student.pwr.edu.pl')
    lpmn_query = 'any2txt|wcrft2|wsd|ccl_emo({"lang":"polish"})|ccl_emo_stats({' \
                 '"lang":"polish", "split_paragraphs": false})'

    string_ids = requester.upload_strings([text])
    response = requester.process_query(lpmn_query, [string_id.text for string_id in string_ids])
    requester.download_response(response[0], './sentiment.zip')

    try:
        with zipfile.ZipFile('sentiment.zip', 'r') as archive:
            with archive.open(archive.namelist()[0]) as data:
                stats_df = pd.read_csv(data, sep=';')

        # pd.to_numeric handles every representation the column can take after
        # read_csv: signed strings such as '-1' (rejected by str.isnumeric), numpy
        # integer scalars (not instances of Python int) and NaN (which int() cannot
        # convert). Non-numeric entries become NaN and are dropped.
        polarity = pd.to_numeric(stats_df['Polarity'], errors='coerce').dropna()
        sentiment_value = sum(int(value) for value in polarity)
    except Exception as e:
        # Best effort: a failed request must not abort feature generation for all rows.
        print(e)
        sentiment_value = 0

    return sentiment_value


def count_positive_words():
    """Return the percentage of rows in ``sentiment.zip`` with a positive polarity.

    Reads the first member of ``sentiment.zip`` (in the working directory) as a
    ``;``-separated CSV and counts the ``Polarity`` values that are greater than
    zero after truncation to an integer. The denominator is the total number of
    rows, including rows whose polarity is missing or non-numeric.

    Returns 0 when the file has no rows, or (after printing the error) when the
    archive cannot be read or parsed.
    """
    try:
        with zipfile.ZipFile('sentiment.zip', 'r') as archive:
            with archive.open(archive.namelist()[0]) as data:
                stats_df = pd.read_csv(data, sep=';')

        total_rows = len(stats_df['Polarity'])
        if total_rows == 0:
            return 0

        # pd.to_numeric accepts signed strings, numpy integer scalars and floats alike;
        # the previous isnumeric()/isinstance() filter silently dropped the first two.
        polarity = pd.to_numeric(stats_df['Polarity'], errors='coerce').dropna()
        positive_count = sum(1 for value in polarity if int(value) > 0)
        positive_words = (positive_count / total_rows) * 100
    except Exception as e:
        # Best effort: fall back to 0 rather than aborting feature generation.
        print(e)
        positive_words = 0

    return positive_words


def count_negative_words():
    """Return the percentage of rows in ``sentiment.zip`` with a negative polarity.

    Reads the first member of ``sentiment.zip`` (in the working directory) as a
    ``;``-separated CSV and counts the ``Polarity`` values that are less than zero
    after truncation to an integer. The denominator is the total number of rows,
    including rows whose polarity is missing or non-numeric.

    Returns 0 when the file has no rows, or (after printing the error) when the
    archive cannot be read or parsed.
    """
    try:
        with zipfile.ZipFile('sentiment.zip', 'r') as archive:
            with archive.open(archive.namelist()[0]) as data:
                stats_df = pd.read_csv(data, sep=';')

        total_rows = len(stats_df['Polarity'])
        if total_rows == 0:
            return 0

        # pd.to_numeric accepts signed strings such as '-1'; str.isnumeric() rejects
        # them, so the previous filter could never count a negative string entry.
        polarity = pd.to_numeric(stats_df['Polarity'], errors='coerce').dropna()
        negative_count = sum(1 for value in polarity if int(value) < 0)
        negative_words = (negative_count / total_rows) * 100
    except Exception as e:
        # Best effort: fall back to 0 rather than aborting feature generation.
        print(e)
        negative_words = 0

    return negative_words

In [None]:
# Run only if there is a need to regenerate all the features. Otherwise, go to Loading features.

# Character-level statistics computed locally from the statement text.
text_feature_functions = {
    'uppercase%': count_uppercase_letters,
    'exclamation_mark%': count_exclamation_marks,
    'question_mark%': count_question_marks,
    'quotation_mark%': count_quotation_marks,
    'punctuation%': count_punctuation,
    'length': count_text_length,
}
for column_name, feature_function in text_feature_functions.items():
    df[column_name] = df['content'].apply(feature_function)

# Sentiment features need one service request per statement. get_sentiment must run
# first for each row because the two word counters read the sentiment.zip it downloads.
rows_list = []
for index, content in df['content'].items():
    rows_list.append({
        'sentiment': get_sentiment(content),
        'positive_words%': count_positive_words(),
        'negative_words%': count_negative_words(),
    })
    print(f'Index: {index}')

# Both frames are renumbered so the column-wise concat lines rows up by position.
sentiment_features = pd.DataFrame(rows_list).reset_index(drop=True)
df = pd.concat([df.reset_index(drop=True), sentiment_features], axis=1)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Processing query complete, file id /requests/makezip/87b8edf9-86d0-4980-b72f-1cdc43da19af
Index: 1173

Processing query complete, file id /requests/makezip/fb79479e-b339-40ed-bed3-41ced41a791b
Index: 1174

Processing query complete, file id /requests/makezip/22d98368-562a-44c7-87b2-a17a3ec2de6f
Index: 1175

Processing query complete, file id /requests/makezip/21c52dfd-2c01-4147-bdff-682c3fd1b634
Index: 1176

Processing query complete, file id /requests/makezip/b3be273c-e815-4159-9243-56933e1681be
Index: 1177

Processing query complete, file id /requests/makezip/36b0af9b-9d30-4b79-ae7a-65109e5b9ac6
Index: 1178

Processing query complete, file id /requests/makezip/5f573ef7-0634-42d2-92de-160edb7baf40
Index: 1179

Processing query complete, file id /requests/makezip/419c6fae-1f63-4a31-aacc-aa42b3a09e29
Index: 1180

Processing query complete, file id /requests/makezip/37e89f26-f816-4d19-a5ca-aaa5aa5d8fa1
Index: 1181

Processi

In [None]:
# Preview the first rows of df.
df.head()

In [None]:
# Write the current df (row index included, the to_csv default) to tfidf_features_path.
# NOTE(review): a later cell writes the TF-IDF feature matrix to this same path and
# overwrites this file — confirm that sharing the path is intended.
df.to_csv(tfidf_features_path)

# Text preprocessing

In [None]:
def remove_punctuation(text):
    """Return ``text`` without the characters listed in ``string.punctuation``."""
    kept_characters = []
    for character in text:
        if character not in string.punctuation:
            kept_characters.append(character)
    return "".join(kept_characters)


def lemmatize(text):
    """Return the ``base`` entries the LPMN tagger reports for ``text``.

    The text is uploaded, processed with the ``wcrft2`` query below and the
    response is downloaded to ``./lem.zip``. The first member of that archive is
    parsed as XML and the text of every ``chunk/sentence/tok/lex/base`` element is
    collected into a list.

    Returns None (after printing the error) when the archive cannot be read or parsed.
    """
    lpmn_query = 'any2txt|wcrft2({"guesser":false, "morfeusz2":true})'
    requester = Requester('241393@student.pwr.edu.pl')

    uploaded_ids = requester.upload_strings([text])
    file_ids = [uploaded.text for uploaded in uploaded_ids]
    response = requester.process_query(lpmn_query, file_ids)
    requester.download_response(response[0], './lem.zip')

    try:
        with zipfile.ZipFile('lem.zip', 'r') as archive:
            xml_payload = archive.read(archive.namelist()[0])
        base_nodes = ET.fromstring(xml_payload).findall('chunk/sentence/tok/lex/base')
        return [node.text for node in base_nodes]
    except Exception as e:
        print(e)
        return None


def tokenize(text):
    """Split ``text`` on runs of non-word characters and return the pieces as a list.

    Leading or trailing separators produce empty strings at the corresponding end
    of the list (standard ``re.split`` behaviour); they are kept as-is.
    """
    # Raw string: '\W' in an ordinary literal is an invalid escape sequence, which
    # recent Python versions warn about. The pattern itself is unchanged.
    return re.split(r'\W+', text)


def remove_stopwords(text):
    """Return the ``output_phrase`` column the LPMN ``termopl2`` query yields for ``text``.

    The text is uploaded, processed with the query below and the response is
    downloaded to ``./no_stopwords.zip``. The first member of that archive is read
    as a tab-separated table with the column names listed below.

    Returns None (after printing the error) when the archive cannot be read or parsed.
    """
    requester = Requester('241393@student.pwr.edu.pl')
    lpmn_query = 'any2txt|morphoDita|dir|termopl2({\"mw\":false,\"sw\":\"/resources/termopl/termopl_sw.txt\",' \
                 '\"cp\":\"/resources/termopl/termopl_cp.txt\"}) '

    uploaded_ids = requester.upload_strings([text])
    file_ids = [uploaded.text for uploaded in uploaded_ids]
    response = requester.process_query(lpmn_query, file_ids)
    requester.download_response(response[0], './no_stopwords.zip')

    # The downloaded table has no header row, so the column names are supplied here.
    termopl_columns = ['idx', 'ranking', 'output_phrase', 'original_phrase', 'c-value',
                       'length', 'freq_s', 'freq_in', 'context']
    try:
        with zipfile.ZipFile('no_stopwords.zip', 'r') as archive:
            with archive.open(archive.namelist()[0]) as data:
                terms = pd.read_csv(data, sep='\t', names=termopl_columns)
        return terms['output_phrase'].tolist()
    except Exception as e:
        # Also covers IndexError (for example an archive without members).
        print(e)
        return None


# TODO: revisit once the text preprocessing steps are decided;
# for now this is a copy-paste from an online course.
def clean_text(text):
    """Strip punctuation from ``text`` and lemmatize the result."""
    return lemmatize(remove_punctuation(text))

In [None]:
# Only the first 5 statements are lemmatized to demonstrate the output;
# the remaining rows of the new column are left empty (NaN).
df['lemmatized'] = df['content'].iloc[:5].apply(lemmatize)
df.head()

In [None]:
# Same demonstration on the first 5 statements, this time with remove_stopwords.
df['lemmatized_without_stopwords'] = df['content'].iloc[:5].apply(remove_stopwords)
df.head()

In [None]:
# Clean every statement; clean_text issues one service request per row.
df['clean_text'] = df['content'].apply(clean_text)
df.head()

In [None]:
# Hold out 25% of the statements for testing. The seed is fixed (same value as the
# random forest below) so the split, and every artefact saved from it, is reproducible.
feature_columns = ['clean_text', 'uppercase%', 'exclamation_mark%', 'question_mark%', 'quotation_mark%',
                   'punctuation%', 'length', 'sentiment', 'positive_words%', 'negative_words%']
X_train, X_test, y_train, y_test = train_test_split(
    df[feature_columns], df['label'], test_size=0.25, random_state=1410)

In [None]:
# Display the training features (row count is limited by the display.max_rows option set earlier).
X_train

## Vectorization

### N-gram

In [None]:
# range = (2, 2)

# ngram_vect = CountVectorizer(ngram_range=range)
# X_ngram = ngram_vect.fit_transform(X['content'])

# X_ngram_df = pd.DataFrame(X_ngram.toarray())
# X_ngram_df.columns = ngram_vect.get_feature_names()
# X_ngram_df

### TF-IDF

In [None]:
# Each clean_text entry is a list of tokens; join them into one space-separated
# string per statement before vectorizing.
clean_joined = df['clean_text'].apply(' '.join)
X_train_clean_joined = X_train['clean_text'].apply(' '.join)
X_test_clean_joined = X_test['clean_text'].apply(' '.join)

# The vocabulary and IDF weights are learned from the training statements only.
tfidf_vect = TfidfVectorizer(ngram_range=(1, 3))
tfidf_vect_fit = tfidf_vect.fit(X_train_clean_joined)

tfidf_train = tfidf_vect_fit.transform(X_train_clean_joined)
tfidf_test = tfidf_vect_fit.transform(X_test_clean_joined)

# Hand-crafted features go first, followed by the dense TF-IDF columns. The index is
# reset so both parts line up by position in the column-wise concat.
handcrafted_columns = ['uppercase%', 'exclamation_mark%', 'question_mark%', 'quotation_mark%', 'punctuation%',
                       'length', 'sentiment', 'positive_words%', 'negative_words%']

X_train_vect = pd.concat(
    [X_train[handcrafted_columns].reset_index(drop=True), pd.DataFrame(tfidf_train.toarray())],
    axis=1)

X_test_vect = pd.concat(
    [X_test[handcrafted_columns].reset_index(drop=True), pd.DataFrame(tfidf_test.toarray())],
    axis=1)

In [None]:
# Saving training and testing sets, one CSV per split, without the row index.
splits_to_save = {
    'X_train': X_train_vect,
    'X_test': X_test_vect,
    'y_train': y_train,
    'y_test': y_test,
}
for split_name, split_data in splits_to_save.items():
    split_data.to_csv(f'{content_root}/{split_name}_tfidf.csv', index=False, header=True)

In [None]:
# Saving all the features (only for GridSearchCV)
# A separate vectorizer is fitted here. tfidf_vect_fit is the object returned by
# tfidf_vect.fit(...), i.e. tfidf_vect itself, so calling tfidf_vect.fit_transform on
# all statements would silently replace the train-only fit used for the saved splits.
# NOTE(review): the vocabulary/IDF is learned from all rows before cross-validation,
# so validation folds influence the features — consider vectorizing inside a Pipeline.
tfidf_vect_all = TfidfVectorizer(ngram_range=(1, 3))
X_tfidf = tfidf_vect_all.fit_transform(clean_joined)
X_tfidf_feat = pd.concat([df[['uppercase%', 'exclamation_mark%', 'question_mark%', 'quotation_mark%', 'punctuation%',
                              'length', 'sentiment', 'positive_words%', 'negative_words%']].reset_index(drop=True),
                          pd.DataFrame(X_tfidf.toarray())], axis=1)

# Written with the default index=True, so the first CSV column is the row index.
X_tfidf_feat.to_csv(tfidf_features_path)

In [None]:
# X_test_vect[0].toarray()

In [None]:
# Loading features
# The file is written by to_csv with its default index=True, so its first column is
# the saved row index. index_col=0 restores it as the index instead of loading it as
# an extra 'Unnamed: 0' feature column.
X_tfidf_feat = pd.read_csv(tfidf_features_path, index_col=0)

# Random Forest

In [None]:
# Exhaustive 5-fold grid search over random forest hyper-parameters.
rf = RandomForestClassifier(random_state=1410)
param = {
    'max_depth': [80, 90, 100, 110, None],
    'max_features': [0.2, None],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}

gs = GridSearchCV(rf, param, cv=5, n_jobs=-1, return_train_score=True)
# The labels live in df; the name `data` used previously is not defined anywhere in
# the notebook. X_tfidf_feat was built from df row by row, so the two line up by position.
gs_fit = gs.fit(X_tfidf_feat, df['label'])

In [None]:
# Show the five parameter combinations with the highest mean cross-validated test score.
cv_results = pd.DataFrame(gs_fit.cv_results_)
cv_results.sort_values('mean_test_score', ascending=False).head(5)

# Model nr 2

# Deep Learning

In [None]:
# Fit the tokenizer on the cleaned statements of the training split. X_train is a
# DataFrame, and iterating over a DataFrame yields its column names, so the text
# column has to be selected explicitly.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train['clean_text'])
X_train_seq = tokenizer.texts_to_sequences(X_train['clean_text'])
X_test_seq = tokenizer.texts_to_sequences(X_test['clean_text'])

# Pad both splits to the length of the longest training sequence.
max_sequence_length = max(len(sequence) for sequence in X_train_seq)
X_train_seq_pad = pad_sequences(X_train_seq, max_sequence_length)
X_test_seq_pad = pad_sequences(X_test_seq, max_sequence_length)



In [None]:
# Inspect the shape of the padded training matrix: (number of sequences, padded length).
X_train_seq_pad.shape

## RNN model

In [None]:
batch_size = 64

# One extra embedding row beyond the tokenizer's vocabulary: Keras Tokenizer indices
# start at 1 and index 0 is what pad_sequences fills with.
vocabulary_size = len(tokenizer.index_word) + 1

# Embedding -> LSTM -> dense hidden layer -> 2-way softmax over the classes.
model = Sequential([
    Embedding(vocabulary_size, 32),
    LSTM(64, dropout=0.3, recurrent_dropout=0.3, recurrent_initializer='glorot_uniform'),
    Dense(64, activation='relu'),
    Dense(2, activation='softmax'),
])
model.summary()


In [None]:
# The training cell feeds one-hot labels (to_categorical) and the model ends in a
# softmax layer, so the matching loss is categorical cross-entropy on probabilities.
# The previous SparseCategoricalCrossentropy(from_logits=True) expected integer labels
# and raw logits, referenced the unimported name `keras`, and was missing the comma
# before `metrics`.
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
# One-hot encode the two classes for the 2-unit softmax output.
y_train_cat = to_categorical(y_train, 2)
y_test_cat = to_categorical(y_test, 2)

# Stop when the monitored loss has not improved for 3 consecutive epochs.
# NOTE(review): this monitors the training loss ('loss'), not 'val_loss' — confirm that is intended.
early_stopping = EarlyStopping(monitor='loss', patience=3)

history = model.fit(
    X_train_seq_pad,
    y_train_cat,
    validation_data=(X_test_seq_pad, y_test_cat),
    batch_size=batch_size,
    epochs=32,
    callbacks=[early_stopping],
)

In [None]:
# Predict for a single test statement. The slice [16:17] keeps the batch dimension
# (shape (1, sequence_length)); indexing with [16] passes a 1-D array, which Keras
# interprets as a batch of one-token samples instead of one padded sequence.
prediction = model.predict(X_test_seq_pad[16:17])
print(len(prediction))
# print(len(prediction))
# prediction = int(stats.mode(prediction).mode)
# print('Treść wypowiedzi:')
# print(X_test.iloc[48])
# print('Klasyfikacja: ', classes_names[prediction])
# predictions = np.argmax(model.predict(test_image_gen), axis=-1)

# plt.figure(figsize=(10,6))
# sns.heatmap(confusion_matrix(test_image_gen.classes,predictions),annot=True)

In [None]:
# Add a leading batch axis to one padded test sequence, score it, and take the index
# of the highest-scoring output unit.
sample_batch = tf.expand_dims(X_test_seq_pad[16], 0)
class_probabilities = model.predict_on_batch(sample_batch)
result = tf.argmax(class_probabilities, axis=1)

# Evaluation