In [1]:
# Load the Reddit title dataset and keep a reproducible 10% sample
# (fixed seed) so the remaining cells run faster.
import pandas as pd

data = (
    pd.read_csv('kaggle_reddit-nsfw-classification-data.csv')
    .sample(frac=0.1, random_state=42)
)

data.head()

Unnamed: 0,title,subreddit,is_nsfw
363285,If only Sibelius had 22va lines...,classicalmusic,False
82884,[buy] SPH video or snap session,Sexsells,True
515983,Did Bill Clinton bang Kyle’s Mum?,southpark,True
489579,I painted this over the weekend and then found...,painting,False
179433,Where are Promenade and Farmland in Ground War...,modernwarfare,False


In [2]:
# Count how many sampled rows fall into each is_nsfw class
data['is_nsfw'].value_counts()

is_nsfw
False    51614
True     10181
Name: count, dtype: int64

In [3]:
# Encode the boolean target as a numeric column: False -> 0, True -> 1
data['label_num'] = data['is_nsfw'].map({False: 0, True: 1})
data.head()

Unnamed: 0,title,subreddit,is_nsfw,label_num
363285,If only Sibelius had 22va lines...,classicalmusic,False,0
82884,[buy] SPH video or snap session,Sexsells,True,1
515983,Did Bill Clinton bang Kyle’s Mum?,southpark,True,1
489579,I painted this over the weekend and then found...,painting,False,0
179433,Where are Promenade and Farmland in Ground War...,modernwarfare,False,0


In [4]:
# Split titles (X) and numeric labels (y) into training and testing sets.
# 70% of the rows go to training; the fixed seed keeps the split reproducible.
from sklearn.model_selection import train_test_split

X = data['title']
y = data['label_num']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=42
)

# Report the size of each partition
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(43256,)
(18539,)
(43256,)
(18539,)


In [5]:
# Import NLTK and make sure every resource used later in the notebook is downloaded
import nltk

# nltk.word_tokenize (called by the tokenizer cell) relies on the Punkt models,
# so they must be fetched too or a fresh environment fails with a LookupError.
# Both resource ids are requested because newer NLTK releases look up
# 'punkt_tab' while older ones look up 'punkt'; nltk.download reports an
# unknown id and returns False rather than raising.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/gargoth/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/gargoth/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
# Extract the post titles, drop missing values and convert them to lowercase.
# NOTE(review): `titles` is not referenced by the cells visible here (the
# vectorizer is fed X_train / X_test directly) — confirm whether it is still needed.
titles = data['title'].dropna().str.lower()

# NLTK helpers used by the tokenizer defined below:
# the stopword corpus and the WordNet lemmatizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Regular expressions are used by the tokenizer for URL removal
import re

from functools import lru_cache


@lru_cache(maxsize=None)
def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loaded once on first use."""
    return frozenset(stopwords.words('english'))


def tokenizer(text):
    """Turn a raw title into a list of cleaned, lemmatized tokens.

    Steps, in order: strip URLs, drop every character that is neither
    alphanumeric nor a space, tokenize with NLTK, remove English stopwords,
    and lemmatize what remains.

    Parameters
    ----------
    text : str
        The title to tokenize. Stopword matching is case-sensitive.
        NOTE(review): this presumably relies on the caller lowercasing the
        text first (CountVectorizer does so by default) — confirm.

    Returns
    -------
    list of str
        The lemmatized tokens that are not English stopwords.
    """
    # URL Removal
    text = re.sub(r"http\S+", "", text)

    # Punctuation Removal: keep only alphanumerics and spaces. Removed
    # characters are deleted, not replaced, so "don't" becomes "dont".
    text = ''.join(c for c in text if c.isalnum() or c == ' ')

    # Tokenization
    tok_list = nltk.word_tokenize(text)

    # Stopword Removal: look the set up once per call instead of calling
    # stopwords.words('english') for every token, which built a fresh list
    # each time and then scanned it linearly.
    stop_set = _english_stopwords()
    tok_list = [word for word in tok_list if word not in stop_set]

    # Lemmatization
    tok_list = [lemmatizer.lemmatize(word) for word in tok_list]

    return tok_list


In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import decomposition

# Generate bag-of-words features. The custom tokenizer replaces the default
# token_pattern (hence token_pattern=None), and terms that appear in more
# than 80% of the documents are discarded.
vectorizer = CountVectorizer(
    tokenizer=tokenizer,
    token_pattern=None,
    max_df=0.8,
    ngram_range=(1, 1),
)

# Use Dask as the joblib backend to parallelize the task
# NOTE(review): CountVectorizer does not appear to dispatch work through
# joblib, so this backend may have no effect on the calls below — confirm.
import joblib
from dask.distributed import Client

client = Client(processes=False)

# Learn the vocabulary on the training titles only, then reuse it for the test titles
with joblib.parallel_backend('dask'):
    X_final_train = vectorizer.fit_transform(X_train)
    X_final_test = vectorizer.transform(X_test)

In [8]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier on the token-count features
nb = MultinomialNB().fit(X_final_train, y_train)

# Predict labels for the held-out titles
y_final_pred = nb.predict(X_final_test)

In [9]:
# Accuracy of the class predictions on the held-out titles.
# NOTE(review): the class counts shown earlier are imbalanced (roughly 84% of
# sampled rows are not NSFW), so read this number against that baseline.
from sklearn import metrics
metrics.accuracy_score(y_true=y_test, y_pred=y_final_pred)

0.9131021090673714