In [4]:
# import libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import plotly.express as px

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [5]:
# Load the sarcasm training data from the CSV file in the working directory,
# then preview its first five rows.
raw_df = pd.read_csv(filepath_or_buffer='train-balanced-sarcasm.csv')
raw_df.head(n=5)

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
0,0,NC and NH.,Trumpbart,politics,2,-1,-1,2016-10,2016-10-16 23:55:23,"Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,Shbshb906,nba,-4,-1,-1,2016-11,2016-11-01 00:24:10,The blazers and Mavericks (The wests 5 and 6 s...
2,0,"They were underdogs earlier today, but since G...",Creepeth,nfl,3,3,0,2016-09,2016-09-22 21:45:37,They're favored to win.
3,0,"This meme isn't funny none of the ""new york ni...",icebrotha,BlackPeopleTwitter,-8,-1,-1,2016-10,2016-10-18 21:03:47,deadass don't kill my buzz
4,0,I could use one of those tools.,cush2push,MaddenUltimateTeam,6,-1,-1,2016-12,2016-12-30 17:00:13,Yep can confirm I saw the tool they use for th...


In [6]:
# Print a summary of raw_df: index range, column names, non-null counts,
# dtypes and memory usage.
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1010826 entries, 0 to 1010825
Data columns (total 10 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   label           1010826 non-null  int64 
 1   comment         1010771 non-null  object
 2   author          1010826 non-null  object
 3   subreddit       1010826 non-null  object
 4   score           1010826 non-null  int64 
 5   ups             1010826 non-null  int64 
 6   downs           1010826 non-null  int64 
 7   date            1010826 non-null  object
 8   created_utc     1010826 non-null  object
 9   parent_comment  1010826 non-null  object
dtypes: int64(4), object(6)
memory usage: 77.1+ MB


In [5]:
# Count missing values per column *before* dropping anything. A notebook cell
# only displays its last expression, so the counts are captured here and
# shown at the end of the cell instead of being computed and thrown away.
null_counts = raw_df.isnull().sum()

# Drop every row that has a missing value in any column. The frame is rebound
# rather than mutated with `inplace=True`, which avoids changing an object in
# place that earlier cells have already displayed. The name `raw_df` is kept
# because the following cells read it.
raw_df = raw_df.dropna()

# Sanity check: nothing missing is left after the drop.
assert not raw_df.isnull().values.any(), "raw_df still contains missing values"

null_counts

In [7]:
# Keep only the rows labelled 1 (the sarcastic comments).
sarcastic_comments = raw_df.loc[raw_df['label'].eq(1)]

# Keep only the rows labelled 0 (the non-sarcastic comments).
n_sarcastic_comments = raw_df.loc[raw_df['label'].eq(0)]

In [27]:
# The ten subreddits with the most sarcastic comments, as raw counts.
# value_counts() orders by count, so the first ten entries are the top ten.
sarcastic_comments['subreddit'].value_counts().iloc[:10]

subreddit
AskReddit          0.401465
GlobalOffensive         NaN
funny              0.451474
leagueoflegends    0.542330
news               0.603457
nfl                     NaN
pcmasterrace       0.566674
pics               0.484400
politics           0.605378
todayilearned      0.547560
worldnews          0.642529
Name: count, dtype: float64

In [10]:
# The ten subreddits with the most non-sarcastic comments, as raw counts.
# value_counts() orders by count, so the first ten entries are the top ten.
n_sarcastic_comments['subreddit'].value_counts().iloc[:10]

subreddit
AskReddit          39310
politics           15586
funny               9840
leagueoflegends     9628
worldnews           9429
pics                8329
pcmasterrace        8228
nfl                 6935
nba                 6698
news                6698
Name: count, dtype: int64

In [39]:
# Share of each subreddit's comments that are sarcastic, largest first.
# The value is a ratio of counts (sarcastic / all), i.e. a fraction rather
# than a 0-100 percentage. Subreddits absent from the sarcastic counts become
# NaN when the two count Series are aligned on their index; those entries are
# removed after sorting.
percent = (
    sarcastic_comments['subreddit'].value_counts()
    .div(raw_df['subreddit'].value_counts())
    .sort_values(ascending=False)
    .dropna()
)
percent

subreddit
0x10c               1.000000
helpmebuildapc      1.000000
Luna_Lovewell       1.000000
highspeedrail       1.000000
M43                 1.000000
                      ...   
magicskyfairy       0.033333
pumparum            0.025641
ledootgeneration    0.022727
ACTrade             0.019608
friendsafari        0.014085
Name: count, Length: 8995, dtype: float64

In [48]:
# Number of comments per subreddit, largest first. value_counts() already
# returns the counts in descending order; the explicit sort is kept (as a new
# Series instead of `inplace=True`) so the resulting order is exactly what it
# was before.
comments = raw_df['subreddit'].value_counts().sort_values(ascending=False)

# A subreddit is kept only if it has strictly more comments than this.
MIN_COMMENTS = 5000

# Subreddits above the threshold. The bare `comments.head(10)` and
# `comments_5000` expressions that used to sit in the middle of this cell were
# removed: a cell displays only its last expression, so they showed nothing.
comments_5000 = comments[comments > MIN_COMMENTS]

# Names of the kept subreddits, in the same order as `comments_5000`.
subreddits = comments_5000.index.tolist()
subreddits

['AskReddit',
 'politics',
 'worldnews',
 'leagueoflegends',
 'pcmasterrace',
 'funny',
 'news',
 'pics',
 'todayilearned',
 'nfl',
 'nba',
 'GlobalOffensive',
 'AdviceAnimals',
 'videos',
 'gaming',
 'The_Donald',
 'WTF',
 'hockey',
 'soccer',
 'TumblrInAction',
 'atheism',
 'CFB',
 'movies',
 'DotA2',
 'technology',
 'creepyPMs',
 'india',
 'gifs',
 'DestinyTheGame']