In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input mount.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/fake-news/submit.csv
/kaggle/input/fake-news/train.csv
/kaggle/input/fake-news/test.csv


In [2]:
# Read the training and test splits of the fake-news dataset into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/fake-news/train.csv',
        '/kaggle/input/fake-news/test.csv',
    )
)

# Data exploration

In [3]:
# Preview the first five rows to check the columns and a sample of their content.
train.head(n=5)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [4]:
# Number of rows per value of the target column, to check class balance.
label_counts = train['label'].value_counts()
print(label_counts)

label
1    10413
0    10387
Name: count, dtype: int64


In [5]:
# Count missing values in each column.
train.isna().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [6]:
# Keep only the rows that actually have an article body in `text`.
has_text = train['text'].notna()
train = train.loc[has_text]

In [7]:
# Despite the column name, this is the character count of the whole `text` value
# (len() of the raw string), summarised to show how document sizes are spread.
train['sentence_length'] = train['text'].map(len)
length_summary = train['sentence_length'].describe()
print(length_summary)

count     20761.000000
mean       4552.715380
std        5130.563491
min           1.000000
25%        1628.000000
50%        3361.000000
75%        6275.000000
max      142961.000000
Name: sentence_length, dtype: float64


# Preprocessing

In [8]:
! pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [9]:
import string
import nltk
# `punkt` serves older NLTK releases; from NLTK 3.9 on, word_tokenize loads the
# pickle-free `punkt_tab` resource instead. The nltk install in this notebook is
# unpinned, so fetch both. `punkt` stays last so the cell's displayed result is unchanged.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [10]:
# Keep the first occurrence of each distinct article body. The result is
# reassigned rather than produced with inplace=True, so no existing frame object
# is mutated behind the reader's back.
train = train.drop_duplicates(subset='text', keep='first')

In [11]:
# Delete every character that falls in the ASCII punctuation class from the text.
# NOTE: inside the brackets, the `\]` pair contributed by string.punctuation is
# parsed as an escaped `]`, so a literal backslash is not in the class and is kept.
train['text'] = train['text'].str.replace(f'[{string.punctuation}]', '', regex=True)

In [12]:
# Lower-case the article text.
train = train.assign(text=train['text'].str.lower())

In [13]:
# Drop any rows still lacking text, then replace each article string with the
# list of tokens produced by NLTK's word tokenizer.
train = (
    train
    .dropna(subset=['text'])
    .assign(text=lambda frame: frame['text'].astype(str).map(nltk.word_tokenize))
)

# SVM

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import accuracy_score, f1_score, classification_report

In [15]:
# Model input is the tokenised `text` column; the target is the `label` column.
X_train, y_train = train['text'], train['label']

In [16]:
# TfidfVectorizer takes one string per document, so each token list is re-joined
# with single spaces before fitting.
# The documents are built from `train['text']` rather than from X_train itself:
# X_train ends this cell as a sparse matrix, so a version that read X_train would
# fail with an AttributeError on `.apply` if the cell were run a second time.
vectorizer = TfidfVectorizer()
train_docs = train['text'].apply(' '.join)
X_train = vectorizer.fit_transform(train_docs)

In [17]:
# Support-vector classifier with an RBF kernel, fitted on the TF-IDF features.
svm_params = {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
clf = svm.SVC(**svm_params)
clf.fit(X_train, y_train)

In [18]:
# In-sample performance: predictions are made on the same rows the classifier was
# fitted on, so this report shows fit to the training data, not accuracy on
# unseen articles.
y_train_pred = clf.predict(X_train)
train_report = classification_report(y_train, y_train_pred)
print(train_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     10387
           1       1.00      1.00      1.00      9999

    accuracy                           1.00     20386
   macro avg       1.00      1.00      1.00     20386
weighted avg       1.00      1.00      1.00     20386



In [19]:
def preprocess_data(data):
    """Clean raw article text and project it into the fitted TF-IDF space.

    Mirrors the cleaning steps of the training cells: strip ASCII punctuation,
    lower-case, tokenise with NLTK, re-join the tokens with single spaces, then
    transform with the module-level ``vectorizer``, which must already be fitted.

    Parameters
    ----------
    data : pandas.Series
        Raw text, one document per element. Missing values are accepted and
        are treated as empty documents.

    Returns
    -------
    The document-term matrix returned by ``vectorizer.transform``, with one row
    per element of ``data`` in the original order.
    """
    # Blank out missing text first. Otherwise NaN passes through the .str
    # methods untouched and astype(str) turns it into the literal word "nan",
    # which would then be scored as if it were article content.
    data = data.fillna('').astype(str)
    # The pattern is kept identical to the one used on the training text so both
    # splits are cleaned the same way.
    data = data.str.replace('['+string.punctuation+']', '', regex=True)
    data = data.str.lower()
    data = data.apply(nltk.word_tokenize)
    # The vectorizer expects strings, not token lists.
    data = data.apply(' '.join)
    return vectorizer.transform(data)

In [20]:
# Keep the row identifiers for the submission, then vectorise the test articles
# and predict a label for each one.
id_test = test['id']
X_test = preprocess_data(test['text'])
y_test_pred = clf.predict(X_test)

In [21]:
# Pair each test id with its predicted label and write the result, without the
# DataFrame index, to submission.csv in the working directory.
submission_df = pd.DataFrame({'id': id_test}).assign(label=y_test_pred)
submission_df.to_csv('submission.csv', index=False)


In [22]:
from joblib import dump

# The classifier only understands vectors produced by this exact fitted
# TfidfVectorizer, so the vectorizer is saved alongside it; the SVM file on its
# own cannot score new raw text after a reload. The classifier is dumped last so
# the cell still displays ['svm_model.joblib'].
dump(vectorizer, 'tfidf_vectorizer.joblib')
dump(clf, 'svm_model.joblib')

['svm_model.joblib']