In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
import matplotlib as plt


nltk.download('stopwords')
stopwords = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\krick\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Ucitavanje i spajanje csv fajlova sa dodatim FAKE atributom da se razlikuje izvor

In [2]:
true_df = pd.read_csv('../datasets/fake-news/True.csv')
fake_df = pd.read_csv('../datasets/fake-news/Fake.csv')

true_df = true_df.assign(fake = [False for _ in true_df.iterrows()])
fake_df = fake_df.assign(fake = [True for _ in fake_df.iterrows()])

df = pd.concat([true_df, fake_df])
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44898 entries, 0 to 23480
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    44898 non-null  object
 1   text     44898 non-null  object
 2   subject  44898 non-null  object
 3   date     44898 non-null  object
 4   fake     44898 non-null  bool  
dtypes: bool(1), object(4)
memory usage: 1.8+ MB


### Ciscenje teksta

In [3]:
df = df.reset_index(drop=True)

def clean_text(text: str) -> str:
    REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
    BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
    STOPWORDS = set(stopwords)
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub('', text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)
    return text

In [4]:
df['text'] = df['text'].apply(clean_text)
df['text'] = df['text'].str.replace('\d+', '')

  


# TF IDF

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

In [6]:
analyze = vectorizer.build_analyzer()
print(f'Primer clanka: {analyze(df.text[0])}')


Primer clanka: ['washington', 'reuters', 'head', 'conservative', 'republican', 'faction', 'us', 'congress', 'voted', 'month', 'huge', 'expansion', 'national', 'debt', 'pay', 'tax', 'cuts', 'called', 'fiscal', 'conservative', 'sunday', 'urged', 'budget', 'restraint', 'keeping', 'sharp', 'pivot', 'way', 'among', 'republicans', 'us', 'representative', 'mark', 'meadows', 'speaking', 'cbs', 'face', 'nation', 'drew', 'hard', 'line', 'federal', 'spending', 'lawmakers', 'bracing', 'battle', 'january', 'return', 'holidays', 'wednesday', 'lawmakers', 'begin', 'trying', 'pass', 'federal', 'budget', 'fight', 'likely', 'linked', 'issues', 'immigration', 'policy', 'even', 'november', 'congressional', 'election', 'campaigns', 'approach', 'republicans', 'seek', 'keep', 'control', 'congress', 'president', 'donald', 'trump', 'republicans', 'want', 'big', 'budget', 'increase', 'military', 'spending', 'democrats', 'also', 'want', 'proportional', 'increases', 'nondefense', 'discretionary', 'spending', 'pro

In [7]:
X = vectorizer.fit_transform(df.text)
print(vectorizer.get_feature_names()[:20])

['_____', '_______', '___________', '_________________________', '__________________________', '_____burns', '___q__', '___supportsdonaldtrump', '__mrefmessage_bubblefeatured', '__saada', '_a_w_w_r_', '_ad_containerquot', '_all_', '_americasidiot', '_anyone_', '_bigger_', '_brendancarroll', '_brittanyv', '_cappex', '_cillaw']


In [9]:
Y = pd.get_dummies(df['fake']).values
print('Shape of label tensor:', Y.shape)

Shape of label tensor: (44898, 2)


In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3,
                                                    random_state = 0)

### Treniranje klasifikatora

In [11]:
from sklearn.naive_bayes import MultinomialNB
classifier_nb = MultinomialNB()
classifier_nb.fit(X_train, y_train[:, 0])

MultinomialNB()

In [12]:
y_pred = classifier_nb.predict(X_test)

# Rezultati

In [13]:
from sklearn.metrics import accuracy_score, classification_report
print(y_test[:, 0])
classification_report_first = classification_report(y_test[:, 0], y_pred)

print('\n Preciznost: ', accuracy_score(y_test[:, 0], y_pred))
print('\nIzvestaj klasifikacije:')
print('======================================================')
print('\n', classification_report_first)

[0 0 0 ... 0 1 1]

 Preciznost:  0.9487750556792873

Izvestaj klasifikacije:

               precision    recall  f1-score   support

           0       0.96      0.94      0.95      7069
           1       0.94      0.96      0.95      6401

    accuracy                           0.95     13470
   macro avg       0.95      0.95      0.95     13470
weighted avg       0.95      0.95      0.95     13470



# Testiranje na novom skupu podataka

In [15]:
new_df = pd.read_csv('../datasets/fake-news/news_articles.csv')
new_df = new_df.assign(fake = [True for _ in new_df.iterrows()])
new_df = new_df[['text', 'fake']]
new_true = pd.read_csv('../datasets/fake-news/articles1.csv')
new_true = new_true.head(len(new_df.index))
new_true = new_true.assign(fake = [False for _ in new_df.iterrows()])
new_true = new_true[['content', 'fake']]
new_true.rename(columns = {'content':'text'}, inplace = True)
new_true.info()

new_df = pd.concat([new_df, new_true])
new_df = new_df.dropna()

new_df = new_df.reset_index(drop=True)

new_df['text'] = new_df['text'].apply(clean_text)
new_df['text'] = new_df['text'].str.replace('\d+', '')



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2096 entries, 0 to 2095
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    2096 non-null   object
 1   fake    2096 non-null   bool  
dtypes: bool(1), object(1)
memory usage: 18.5+ KB




In [16]:
X_new = vectorizer.fit_transform(new_df.text)
# get first dimension of X_new
X_new = X_new.toarray()

Y_new = pd.get_dummies(new_df['fake']).values
print(X_new.shape)
print(Y_new.shape)

(4146, 78333)
(4146, 2)


In [83]:
print(X_new[:20])

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [14]:
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(X_new[:,0], Y_new[:,0], test_size = 0.3,
                                                    random_state = 0)

NameError: name 'X_new' is not defined

In [88]:
print(vectorizer.get_feature_names()[:20])
classifier_nb_2 = MultinomialNB()
classifier_nb_2.fit(X_train_new, y_train_new)

IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed

In [55]:
y_pred_new = classifier_nb_2.predict(X_test_new)

In [60]:
classification_report_new = classification_report(y_test_new, y_pred_new)

print('\n Preciznost: ', accuracy_score(y_test_new, y_pred_new))
print('\nIzvestaj klasifikacije:')
print('======================================================')
print('\n', classification_report_new)


 Preciznost:  0.5040192926045016

Izvestaj klasifikacije:

               precision    recall  f1-score   support

           0       0.00      0.00      0.00       617
           1       0.50      1.00      0.67       627

    accuracy                           0.50      1244
   macro avg       0.25      0.50      0.34      1244
weighted avg       0.25      0.50      0.34      1244



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
