# Importing the libraries

In [22]:
import numpy as np
import pandas as pd

# Reading Data Sets

In [91]:
# Load the two source CSV files (fake tweets first, then real tweets).
fake_data, real_data = (pd.read_csv(csv_path) for csv_path in ("fake.csv", "real.csv"))

# Add new column with 0 for fake and 1 for real

In [92]:
# Tag every row with its class label: 0 for fake tweets, 1 for real tweets.
for frame, label in ((fake_data, 0), (real_data, 1)):
    frame["target"] = label

# Concatenating fake data and real data

In [93]:
# Stack the fake and real frames into one frame with a fresh 0..n-1 index.
dataset = pd.concat([fake_data, real_data], ignore_index=True)

# Shuffle the rows so the two classes are interleaved. The fixed seed keeps the
# row order (and everything computed from it) identical on every run.
dataset = dataset.sample(frac=1, random_state=42).reset_index(drop=True)

# Taking a look at the data set

In [26]:
# Preview the first rows of the combined, shuffled dataset.
dataset.head()

Unnamed: 0,tweet,target
0,2- the egyptian football association's announc...,1
1,consider the al-ahly match as a legal shoulder...,0
2,abdel hamid bassiouni is the owner of the fast...,1
3,"ibra laughed, people were joking in the middle...",0
4,the best thing about -ahly is that it breaks t...,0


In [27]:
# Dimensions of the dataset as (rows, columns).
dataset.shape

(41868, 2)

In [31]:
# Data type of each column.
dataset.dtypes

tweet     object
target     int64
dtype: object

## Take a quick look at the `target` column:

In [34]:
# Number of rows carrying each target label (0 = fake, 1 = real).
dataset['target'].value_counts()

1    21863
0    19988
Name: target, dtype: int64

## Check for missing values

In [28]:
# Count of missing values in each column.
dataset.isnull().sum()

tweet     17
target     0
dtype: int64

In [29]:
# Discard every row that has a missing value in any column.
dataset = dataset.dropna()

# Number of rows that remain.
dataset.shape[0]

41851

In [30]:
# Re-check the per-column missing-value counts after the rows with nulls were dropped.
dataset.isnull().sum()

tweet     0
target    0
dtype: int64

# Detect & remove empty strings

In [33]:
# Find tweets that are empty or consist only of whitespace.
#
# The text column is selected by name. Positional unpacking of
# dataset.itertuples() yields (index, tweet, target) in column order, so the
# third field is the integer label and can never be a blank string.
# Comparing the stripped text to "" also catches the empty string, which
# str.isspace() reports as False.
tweet_text = dataset["tweet"]
is_blank = tweet_text.map(
    lambda text: isinstance(text, str) and text.strip() == ""
).astype(bool)

# Index labels of the rows with blank tweets.
blanks = dataset.index[is_blank].tolist()

print(len(blanks), 'blanks: ', blanks)

# Remove the blank rows so they do not reach the train/test split.
dataset = dataset.drop(index=blanks)

0 blanks:  []


## Split the data into train & test sets:

In [38]:
from sklearn.model_selection import train_test_split

# Features are the raw tweet text; labels are the 0/1 target column.
X, y = dataset["tweet"], dataset["target"]

# Hold out 33% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.33, random_state=42)
X_train, X_test, y_train, y_test = split_parts

## Build pipelines to vectorize the data, then train and fit a model


In [67]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier

# Two-stage pipeline: a TF-IDF vectorizer followed by an XGBoost classifier,
# both constructed with their default arguments.
pipeline_steps = [
    ("tfidf", TfidfVectorizer()),
    ("clf", XGBClassifier()),
]
text_clf_xgb = Pipeline(pipeline_steps)

## Feed the training data through the pipeline

In [68]:
# Fit the TF-IDF + XGBoost pipeline on the training split.
text_clf_xgb.fit(X_train, y_train)

## Run predictions and analyze the results (XGBoost)

In [69]:
# Predict a label for every tweet in the held-out test split.
predictions = text_clf_xgb.predict(X_test)

In [70]:
# Compute the confusion matrix of true vs. predicted test labels, then print it.
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

conf_matrix = confusion_matrix(y_test, predictions)
print(conf_matrix)

[[6158  430]
 [ 379 6844]]


In [71]:
# Build the per-class precision / recall / F1 report for the test split and print it.
report_text = classification_report(y_test, predictions)
print(report_text)

              precision    recall  f1-score   support

           0       0.94      0.93      0.94      6588
           1       0.94      0.95      0.94      7223

    accuracy                           0.94     13811
   macro avg       0.94      0.94      0.94     13811
weighted avg       0.94      0.94      0.94     13811



In [72]:
# Compute the overall accuracy on the test split and print it.
overall_accuracy = accuracy_score(y_test, predictions)
print(overall_accuracy)

0.9414235030048512


## Feed new data into a trained model


In [88]:
# Read one piece of text from the user; it is classified in the following cells.
review = input("Enter the review : ")

Enter the review : the best thing about -ahly is that it breaks the eye of  in all games within a short period: 26 victories against 2, "o believer"!!\ndespite this humiliation, the mastaba group of the so-called # mortada_mansour comes to praise him and his children, and he really "deserves praise": women, drugs, and laser sessions.. and what was hidden was greater!!!


In [89]:
# Classify the entered text with the trained pipeline; it is passed as a one-element list.
predict = text_clf_xgb.predict([review])  

In [90]:
# Translate the predicted label into a message: 0 means fake, anything else means real.
verdict = "The tweet is fake" if predict == 0 else "The tweet is real"
print(verdict)

The tweet is fake
