In [256]:
#!/bin/env python3.6

# Data handling, URL parsing and small stdlib helpers.
import pandas as pd
from urllib.parse import urlparse
import os, re
# NLTK supplies the stop-word list and the lemmatizer/stemmer used for text cleaning.
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK resources used below: the stop-word corpus (passed to
# TfidfVectorizer) and WordNet (needed by WordNetLemmatizer).
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
# Shared lemmatizer and stemmer instances, reused by clean_stem_lemattize.
wordnet_lemmatizer = WordNetLemmatizer()
porter_stemmer = PorterStemmer()
# scikit-learn pieces: text vectorizer, train/test split, metrics and the classifier.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier

[nltk_data] Downloading package stopwords to /Users/MSG/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/MSG/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In this work, the idea of detecting phony links is taken from an article on <br>
**TowardsDataScience.com** about **"Phishing URL Detection with ML"** <br>
link: https://towardsdatascience.com/phishing-domain-detection-with-ml-5be9c99293e5

In [257]:
# The dataset is expected in the current working directory; build its full
# path from the directory and the file name.
path, filename = os.getcwd(), 'Sarcasm_Headlines_Dataset.json'
full_file_name = os.path.join(path, filename)

#### Reading the json file and making a dataframe (df)

In [258]:
# The file holds one JSON record per line. Read it in chunks of 10,000
# records, then stitch the chunks back together into a single DataFrame.
dd = pd.read_json(full_file_name, lines=True, chunksize=10000)
df = pd.concat(dd)

#### Parsing the links with `urlparse`, which splits a link into its scheme, netloc, path, params, query and fragment parts.

In [260]:
# Split every article link into its URL components; each cell of the new
# column holds the ParseResult returned by urlparse.
df['parsed_links'] = df['article_link'].apply(urlparse)

An example of a parse result: <br>
 - ParseResult(scheme='http', netloc='www.cwi.nl:80', path='/%7Eguido/Python.html', params='', query='', fragment='')

In [261]:
# Preview the first rows of the frame, including the new parsed_links column.
df.head()

Unnamed: 0,article_link,headline,is_sarcastic,parsed_links
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0,"(https, www.huffingtonpost.com, /entry/versace..."
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0,"(https, www.huffingtonpost.com, /entry/roseann..."
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1,"(https, local.theonion.com, /mom-starting-to-f..."
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1,"(https, politics.theonion.com, /boehner-just-w..."
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0,"(https, www.huffingtonpost.com, /entry/jk-rowl..."


#### Creating new features by extracting the netloc and the length of the path of each link

In [263]:
# Two features are derived from each parsed link: the host name (netloc) and
# the number of characters in the path. Note that the 'path' column stores the
# length of the path, not the path string itself.
df['netloc'] = df['parsed_links'].apply(lambda parsed: parsed.netloc)
df['path'] = df['parsed_links'].apply(lambda parsed: len(parsed.path))
# The query component is not used yet; only the netloc name and the path
# length are turned into features for now.
# df['query'] = df.parsed_links.apply(lambda x: x.query)

#### Categorizing the length of the path of each link into five bins with edges 15, 53, 65, 67, 100 and 185: 'short16' (16–53 characters), 'short53' (54–65), 'medium65' (66–67), 'long100' (68–100) and 'crazylong184' (101–185).


In [264]:
# Bucket the path length (stored in df.path) into five right-closed intervals:
# (15, 53], (53, 65], (65, 67], (67, 100] and (100, 185].
bins = [15, 53, 65, 67, 100, 185]
labels = ['short16', 'short53', 'medium65', 'long100', 'crazylong184']
category = pd.cut(df.path, bins, labels=labels).to_frame(name='category_length')

# Assign the column directly rather than using pd.concat(..., axis=1).
# concat appended one more 'category_length' column on every re-run of this
# cell, and the duplicated column was then joined into df['text'] as well.
df['category_length'] = category['category_length']

# Combine host name, length category and headline into one text field.
# NOTE(review): a path length outside (15, 185] yields a NaN category, on
# which ' '.join raises TypeError. Confirm the dataset has no such rows.
df['text'] = df[['netloc', 'category_length', 'headline']].apply(lambda row: ' '.join(row), axis=1)

#### We need to clean, stem and lemmatize the text to make it ready for training our model:
In this part we follow the steps below:
    
- Removing all the special characters
- Removing all single characters
- Removing single characters from the start
- Substituting multiple spaces with single space
- Removing prefixed 'b'
- Converting to Lowercase
- Lemmatization based on WordNet's morphy function
- Stemming using the Porter Stemming Algorithm,

In [265]:
def clean_stem_lemattize(x):
    """Normalise a piece of text and reduce every word to its stem.

    Steps, in order:
        - Replace every run of non-word characters with a single space.
        - Drop single ASCII letters that are surrounded by whitespace.
        - Drop a single ASCII letter at the very start of the text.
        - Collapse repeated whitespace into a single space.
        - Convert to lowercase.
        - Lemmatize each word with ``wordnet_lemmatizer`` and then stem the
          result with ``porter_stemmer``.

    Args:
        x (str): Raw text to clean.

    Returns:
        str: The processed words joined by single spaces.
    """
    x = re.sub(r'\W+', ' ', x)
    # Adjacent single letters share their separating whitespace, so in a run
    # such as " a b c " this pattern removes only every other letter.
    x = re.sub(r'\s+[a-zA-Z]\s+', ' ', x)
    # Anchor at the start of the string. The pattern used to be r'\^[a-zA-Z]\s+',
    # whose escaped caret matches a literal '^'; no '^' can remain after the
    # \W+ substitution above, so a leading single letter was never removed.
    x = re.sub(r'^[a-zA-Z]\s+', ' ', x)
    x = re.sub(r'\s+', ' ', x)
    x = x.lower()
    # split() also discards any leading/trailing space left by the steps above.
    x = [porter_stemmer.stem(wordnet_lemmatizer.lemmatize(word)) for word in x.split()]

    return ' '.join(x)


import time

# Clean every row of the text column and report how long the pass took, in
# seconds. perf_counter() is the clock intended for measuring elapsed time;
# time.time() follows the wall clock, which can be adjusted while the cell runs.
start = time.perf_counter()
df['text'] = df['text'].apply(clean_stem_lemattize)
print(time.perf_counter() - start)

10.866698980331421


#### Now it is time to train our model. First, we need to vectorize our text and then split our data into train and test sets. <br>
- I chose RandomForestClassifier as a good model for this classification problem. 


In [268]:
# Model inputs: the cleaned text column and the binary sarcasm label.
# `documents` used to be referenced without ever being defined in this
# notebook, so the cell raised NameError on a fresh kernel. Bind it explicitly.
documents = df.text
y = df.is_sarcastic

# TF-IDF features: keep at most 1500 terms, drop terms that appear in fewer
# than 5 documents or in more than 65% of them, and ignore English stop words.
tfidfconverter = TfidfVectorizer(max_features=1500, min_df=5, max_df=0.65, stop_words=stopwords.words('english'))
X = tfidfconverter.fit_transform(documents).toarray()
# NOTE(review): the vectorizer is fitted on all rows before the split, so the
# test rows contribute to the vocabulary and IDF weights. Consider fitting on
# the training portion only.

print('Starting Splitting Train_Test')
print('#'*30)

# Hold out 20% of the rows for testing; train on the remaining 80%.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

print('Starting Fitting Model')
print('#'*30)

classifier = RandomForestClassifier(n_estimators=100, random_state=0)
classifier.fit(X_train, y_train)

print('Starting Prediction!!')
print('#'*30)

y_pred = classifier.predict(X_test)

# ---------------------------------------------------------------------------
# Check the accuracy of the model on the held-out test set.
# NOTE(review): the text includes tokens from the link's netloc. If the domain
# alone identifies the label, the scores below measure the source site rather
# than the headline. Verify before relying on them.
# ---------------------------------------------------------------------------

print('Starting test Score!!!')
print('#'*30)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

Starting Splitting Train_Test
Starting Fitting Model
##############################
Starting Prediction!!
##############################
Starting test Score!!!
##############################
[[3007    0]
 [   0 2335]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3007
           1       1.00      1.00      1.00      2335

   micro avg       1.00      1.00      1.00      5342
   macro avg       1.00      1.00      1.00      5342
weighted avg       1.00      1.00      1.00      5342

1.0


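#### Done! Our model predicts whether an article is sarcastic (phony) or not based on its link and the headline of the article! Note: the perfect test score most likely reflects that the link's domain (e.g. theonion.com vs. huffingtonpost.com in the preview above) nearly determines the label in this dataset, so it should not be read as evidence that the model detects sarcasm from the headline alone.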