## Načítanie knižníc

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

## Načítanie dát

In [3]:
df_train = pd.read_csv('data/Twitter/twitter_training.csv')
df_test = pd.read_csv('data/Twitter/twitter_validation.csv')

df_train

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...
...,...,...,...,...
74676,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74677,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74678,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74679,9200,Nvidia,Positive,Just realized between the windows partition of...


In [4]:
df_test

Unnamed: 0,3364,Facebook,Irrelevant,"I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣"
0,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
1,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
2,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
3,4433,Google,Neutral,Now the President is slapping Americans in the...
4,6273,FIFA,Negative,Hi @EAHelp I’ve had Madeleine McCann in my cel...
...,...,...,...,...
994,4891,GrandTheftAuto(GTA),Irrelevant,⭐️ Toronto is the arts and culture capital of ...
995,4359,CS-GO,Irrelevant,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...
996,2652,Borderlands,Positive,Today sucked so it’s time to drink wine n play...
997,8069,Microsoft,Positive,Bought a fraction of Microsoft today. Small wins.


## Označenie stĺpcov

In [5]:
df_train.columns=['id','info','target','tweet']
df_test.columns=['id','info','target','tweet']

In [6]:
df_train.head()

Unnamed: 0,id,info,target,tweet
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [7]:
df_test.head()

Unnamed: 0,id,info,target,tweet
0,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
1,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
2,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
3,4433,Google,Neutral,Now the President is slapping Americans in the...
4,6273,FIFA,Negative,Hi @EAHelp I’ve had Madeleine McCann in my cel...


## Predspracovanie dáta
#### Odstránenie irelevantných stĺpcov

In [8]:
df_train.drop(['id','info'], axis=1, inplace=True)
df_test.drop(['id','info'], axis=1, inplace=True)

In [9]:
df_train.isna().sum()

target      0
tweet     686
dtype: int64

#### Odstránenie chýbajúcich hodnôt

In [10]:
df_train.dropna(inplace=True)

#### Odstránenie duplikátov

In [11]:
df_train.duplicated().sum()

np.int64(4227)

In [12]:
df_train.drop_duplicates(inplace=True)

#### Odstránenie špeciálnych znakov, prevedenie na malé písmo a odstránenie stop slov

In [13]:
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

def clean_text(text):
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = text.lower()
    text = " ".join(word for word in text.split() if word not in stop_words)
    return text

df_train["cleaned_tweet"] = df_train["tweet"].apply(clean_text)

[nltk_data] Downloading package stopwords to /Users/m4/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### Tokenizácia

In [14]:
nltk.download("punkt_tab")

def tokenize_text(text):
    return word_tokenize(text)

df_train["tokenized_tweet"] = df_train["cleaned_tweet"].apply(tokenize_text)

[nltk_data] Downloading package punkt_tab to /Users/m4/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


#### Lemmatizácia

In [15]:
nltk.download("wordnet")

def lemmatize_text(tokens):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens]

df_train["lemmatized_tweet"] = df_train["tokenized_tweet"].apply(lemmatize_text)


[nltk_data] Downloading package wordnet to /Users/m4/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Extrakcia príznakov


#### Vektorová reprezentácia pomocou TF-IDF

In [16]:
df_train["tf-idf_apply"] = df_train["lemmatized_tweet"].apply(lambda tokens: " ".join(tokens))

vectorizer = TfidfVectorizer(max_features=7500)  
X = vectorizer.fit_transform(df_train["tf-idf_apply"])

In [17]:
df_train.head(10)

Unnamed: 0,target,tweet,cleaned_tweet,tokenized_tweet,lemmatized_tweet,tf-idf_apply
0,Positive,I am coming to the borders and I will kill you...,coming borders kill,"[coming, borders, kill]","[coming, border, kill]",coming border kill
1,Positive,im getting on borderlands and i will kill you ...,im getting borderlands kill,"[im, getting, borderlands, kill]","[im, getting, borderland, kill]",im getting borderland kill
2,Positive,im coming on borderlands and i will murder you...,im coming borderlands murder,"[im, coming, borderlands, murder]","[im, coming, borderland, murder]",im coming borderland murder
3,Positive,im getting on borderlands 2 and i will murder ...,im getting borderlands murder,"[im, getting, borderlands, murder]","[im, getting, borderland, murder]",im getting borderland murder
4,Positive,im getting into borderlands and i can murder y...,im getting borderlands murder,"[im, getting, borderlands, murder]","[im, getting, borderland, murder]",im getting borderland murder
5,Positive,So I spent a few hours making something for fu...,spent hours making something fun dont know hug...,"[spent, hours, making, something, fun, dont, k...","[spent, hour, making, something, fun, dont, kn...",spent hour making something fun dont know huge...
6,Positive,So I spent a couple of hours doing something f...,spent couple hours something fun dont know im ...,"[spent, couple, hours, something, fun, dont, k...","[spent, couple, hour, something, fun, dont, kn...",spent couple hour something fun dont know im h...
7,Positive,So I spent a few hours doing something for fun...,spent hours something fun dont know im huge bo...,"[spent, hours, something, fun, dont, know, im,...","[spent, hour, something, fun, dont, know, im, ...",spent hour something fun dont know im huge bor...
8,Positive,So I spent a few hours making something for fu...,spent hours making something fun dont know hug...,"[spent, hours, making, something, fun, dont, k...","[spent, hour, making, something, fun, dont, kn...",spent hour making something fun dont know huge...
9,Positive,2010 So I spent a few hours making something f...,spent hours making something fun dont know hug...,"[spent, hours, making, something, fun, dont, k...","[spent, hour, making, something, fun, dont, kn...",spent hour making something fun dont know huge...


## Rozdelenie dát na trénovacie a testovacie

## Modelovanie

## Vyhodnotenie modelov