In [428]:
from datapipeline import pipeline
import pandas as pd
import numpy as np
import re

Load in data using our pipeline

In [429]:
pipe = pipeline()
df = pipe.load_csv('posts_2019_all.csv')
df.rename(columns ={'link_flair_css_class': 'type'}, inplace = True)

In [430]:
print(df.shape)
df.head(100)

(236758, 9)


Unnamed: 0,created_utc,title,subreddit,author,num_comments,score,id,type,author_flair_css_class
0,1546300891,Malik Monk with a FEROCIOUS Dunk! Announcer is...,nba,exc99,64,736,abcv10,highlights,Pistons1
1,1546301066,Spurs' Gay out vs. Celtics with injured wrist,nba,bornfromblue,12,11,abcvzx,,
2,1546301178,Kemba Dances Around on The Hardwood!,nba,Kraze_F35,11,35,abcwm2,highlights,ChaHornets2
3,1546301267,What should a player have to do for an organis...,nba,Tr3ywayy,19,0,abcx3a,,
4,1546301353,Malik Monk beats the buzzer going into halftime!,nba,Kraze_F35,3,31,abcxmd,highlights,ChaHornets2
5,1546301432,Evan Fourier Crazy Buzzer Beater to Win the Game,nba,[deleted],0,1,abcy4o,,
6,1546301509,Is Doncic the best white player in the NBA today?,nba,[deleted],72,0,abcylh,,
7,1546301709,Trade thoughts.,nba,[deleted],7,0,abczt4,,
8,1546301756,Anyone else think that KD is better than LeBro...,nba,[deleted],0,1,abd035,,
9,1546301783,Elfrid Payton will start tonight for the Pelic...,nba,mycowsfriend,9,63,abd09f,,


In [431]:
df['type'].replace(np.nan, 'None', inplace = True)
post_types = df['type'].unique()
post_types

array(['highlights', 'None', 'gamethread', 'postgamethread', 'news',
       'discussion', 'miscmedia', 'rostermoves', 'index', 'spoilers',
       'rnba', 'ama', 'trashtalk', 'shitpost', 'meta', 'paywall'],
      dtype=object)

In [432]:
for type in post_types:
    n = df[df['type'] == type].size
    print('Type: {0:15}  number of posts: {1}'.format(type, n))

Type: highlights       number of posts: 151290
Type: None             number of posts: 1843542
Type: gamethread       number of posts: 9135
Type: postgamethread   number of posts: 11169
Type: news             number of posts: 61110
Type: discussion       number of posts: 12834
Type: miscmedia        number of posts: 7623
Type: rostermoves      number of posts: 20403
Type: index            number of posts: 1773
Type: spoilers         number of posts: 10341
Type: rnba             number of posts: 792
Type: ama              number of posts: 45
Type: trashtalk        number of posts: 45
Type: shitpost         number of posts: 9
Type: meta             number of posts: 9
Type: paywall          number of posts: 702


In [433]:
df['title'][df['type'] == 'highlights'].sample(10)

17675     Remembering Charles' Barkley's epic rant on Sa...
207088    Kassius Robertson with the DISGUSTING poster d...
157761    And a happy championship parade Day to this Le...
44524       Dirk Nowitzki's first round in the 3-pt contest
108178                 Curry hits the depe three and smiles
82872               So how much they pay you at USC though?
11357     Remember Fultz's First Media Day? ChickFilA Ho...
9563      [HIGHLIGHT] Giannis dunks and the Bradley Cent...
144601    A closeup slow motion view of KD’s achilles in...
79931     The TNT commentary crew discusses "load manage...
Name: title, dtype: object

In [434]:
df['title'][df['type'] == 'gamethread'].sample(10)

66650    GAME THREAD: Memphis Grizzlies (28-41) @ Washi...
62097    GAME THREAD: Brooklyn Nets (34-33) @ Atlanta H...
55932    GAME THREAD: Oklahoma City Thunder (38-23) @ S...
39737    GAME THREAD: Charlotte Hornets (27-28) @ India...
58879    GAME THREAD: Houston Rockets (38-25) @ Toronto...
64377    GAME THREAD: Minnesota Timberwolves (32-35) @ ...
63412    GAME THREAD: Sacramento Kings (33-32) @ Washin...
48573       [Discussion Thread] Inside the NBA - 2/21/2019
55859    GAME THREAD: Detroit Pistons (29-31) @ Clevela...
91266    GAME THREAD: Detroit Pistons (41-41) @ Milwauk...
Name: title, dtype: object

We should probably take '[Highlight] and 'GAME THREAD' out of the post titles as to not make things too easy to classify


In [435]:
df['original_title'] = df['title'].copy() #just so we have the original somewhere

In [436]:
titles = df['title'].copy()
titles = titles.str.replace(r'GAME THREAD: ', '')
titles = titles.str.replace(r'\[(Highlights)\] ', "")
titles = titles.str.replace(r'\[(Highlights)\] ', "")
titles

0         Malik Monk with a FEROCIOUS Dunk! Announcer is...
1             Spurs' Gay out vs. Celtics with injured wrist
2                      Kemba Dances Around on The Hardwood!
3         What should a player have to do for an organis...
4          Malik Monk beats the buzzer going into halftime!
5          Evan Fourier Crazy Buzzer Beater to Win the Game
6         Is Doncic the best white player in the NBA today?
7                                           Trade thoughts.
8         Anyone else think that KD is better than LeBro...
9         Elfrid Payton will start tonight for the Pelic...
10        Mildly interesting: the Hornets (if they win) ...
11        Dallas Mavericks (17-18) @ Oklahoma City Thund...
12                                                    CP3P0
13        Do you agree with my full All-Star team select...
14        Memphis Grizzlies (18-17) @ Houston Rockets (2...
15               Meet 6 Of The NBA’s NEWEST Breakout STARS!
16                     Is CP3 going to w

In [437]:
# df['title'] = titles
df[['title','original_title','type']][df['type'] == 'highlights'].sample(10)

Unnamed: 0,title,original_title,type
20604,KD draws the shooting foul on Gordon Hayward,KD draws the shooting foul on Gordon Hayward,highlights
28629,Khris Middleton with a circus one hand midrang...,Khris Middleton with a circus one hand midrang...,highlights
10163,First team all-defense Lonzo Ball getting cros...,First team all-defense Lonzo Ball getting cros...,highlights
119319,"Lowry with the quick hands, gets the strip on ...","Lowry with the quick hands, gets the strip on ...",highlights
8566,Austin Rivers freezes up as he falls to the gr...,Austin Rivers freezes up as he falls to the gr...,highlights
205507,I don’t agree with Jalen Roses take,I don’t agree with Jalen Roses take,highlights
236443,"One of the best comebacks in NBA history, Trac...","One of the best comebacks in NBA history, Trac...",highlights
119803,Zion's live reaction to NOLA receiving the #1 ...,Zion's live reaction to NOLA receiving the #1 ...,highlights
173715,[Wojnarowski] He (Kemba) has moved on from Cha...,[Wojnarowski] He (Kemba) has moved on from Cha...,highlights
23584,Gregg Popovich on the Spurs win against the Su...,Gregg Popovich on the Spurs win against the Su...,highlights
