In [1]:
import pandas as pd
import numpy as np

In [2]:

def load_format_data(file_path, is_fake):
    df = pd.read_csv(file_path)
    df.insert(0, "is_fake", is_fake)
    return df

df_fake = load_format_data("./data/Fake.csv", 1)
df_true = load_format_data("./data/True.csv", 0)
df = pd.concat([df_fake, df_true], ignore_index=True)
df = df.sample(frac=1).reset_index(drop=True)
duplicates = df.duplicated(subset=["text"], keep=False)

print(" dup news :", duplicates)

print(df.info())



 dup news : 0        False
1        False
2         True
3        False
4        False
         ...  
44893    False
44894    False
44895    False
44896     True
44897     True
Length: 44898, dtype: bool
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   is_fake  44898 non-null  int64 
 1   title    44898 non-null  object
 2   text     44898 non-null  object
 3   subject  44898 non-null  object
 4   date     44898 non-null  object
dtypes: int64(1), object(4)
memory usage: 1.7+ MB
None


In [3]:
from collections import Counter

fake_words = Counter(" ".join(df[df["is_fake"]==1]["text"]).split())
real_words = Counter(" ".join(df[df["is_fake"]==0]["text"]).split())

exclusive_fake = set(fake_words) - set(real_words)
exclusive_real = set(real_words) - set(fake_words)

print("Fake sample words:", list(exclusive_fake)[:20])
print("Real sample words :", list(exclusive_real)[:20])

Fake sample words: ['Proficiency', 'ugly.Brandon', 'Sheindie', 'THEY?!With', '2017Nancy', '(D-SC)', 'Violence.Here', 'others.These', 'letters:', 'wage-rule', 'printouts', '2016Finally,', 'Amendment.The', 'dole!Palo', '.https://twitter.com/P0TUSTrump/status/773724216247984129Trump', 'pic.twitter.com/cvQP6Fegz4', '#fallfashion', 'DailyMailOnline', '$117,425,683Chile', 'more.There']
Real sample words : ['Zawy', 'Senegal.', 'egregiousness', 'DOUBLELINE', 'expensive,”', '“clawback”', '“Voters', 'precedent.”', 'TransWest', 'Ceresney,', '“accusations', '62.48', 'hold?”', 'watch.”', 'Palanker.', 'redistricting.', 'Representatives.”', 'Gotthard', 'campaigners’', 'APG']


In [None]:
## As the dataset is full of duplicates news, we remove them to be able to train our model more accuratly

df = df.drop(index=np.where(duplicates)[0])

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.preprocessing import OneHotEncoder

text_col = ["title", "text"]
df['full_text'] = df[text_col].fillna('').agg(' '.join, axis=1)

vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
X_text = vectorizer.fit_transform(df['full_text'][1:])

# Réduction de dimensionnalité avec SVD
svd = TruncatedSVD(n_components=50, random_state=42)
X_text_reduced = svd.fit_transform(X_text)

encoder = OneHotEncoder()
X_num_scaled = encoder.fit_transform(np.array(df["subject"][1:]).reshape(-1, 1))
X_num_scaled_dense = X_num_scaled.toarray()

X = np.hstack([X_num_scaled_dense, X_text_reduced])

print(X)


[[ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  4.69663986e-02
  -7.56175746e-03 -6.82923348e-03]
 [ 1.00000000e+00  0.00000000e+00  0.00000000e+00 ... -4.06542373e-02
  -3.72816940e-02 -1.30096507e-02]
 [ 0.00000000e+00  1.00000000e+00  0.00000000e+00 ...  3.31941197e-02
   1.69316288e-02  4.26213405e-02]
 ...
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ... -3.57372910e-03
  -2.89139248e-03 -3.12366127e-02]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  9.02913919e-04
   3.97174220e-02  1.68906563e-02]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  1.51597287e-02
  -2.81053654e-02  6.09964133e-03]]


In [6]:
# print(X_text.shape)
# print(X_num_scaled.shape)
# X = np.hstack([X_num_scaled.toarray(), X_text.toarray()])
# print(X.shape)

In [7]:
from sklearn.model_selection import train_test_split

label = df.pop("is_fake")
label = label[1:]


print(label.shape)

X_train, X_test, y_train, y_test = train_test_split(X, label, test_size=0.2)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)

(33505,)
(26804, 57)
(6701, 57)
(26804,)


In [8]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier()
forest.fit(X_train, y_train)
y_pred = forest.predict(X_test)


In [9]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
print(f"RandomForest Test Accuracy: {accuracy:.4f}")

RandomForest Test Accuracy: 1.0000


In [None]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(C=0.01)
lr.fit(X_train, y_train)
train_accuracy = lr.score(X_train, y_train)
print(f"LogisticRegression Train Accuracy: {train_accuracy:.5f}")
test_accuracy = lr.score(X_test, y_test)
print(f"LogisticRegression Test Accuracy: {test_accuracy:.5f}")

LogisticRegression Train Accuracy: 0.98649
LogisticRegression Test Accuracy: 0.98776


In [11]:
from sklearn.dummy import DummyClassifier

dc = DummyClassifier()
dc.fit(X_train, y_train)
test_dc_accuracy = dc.score(X_test, y_test)
print(test_dc_accuracy)

0.6206536337860021
