# Flair Prediction Model

In [1]:
import pickle
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from bs4 import BeautifulSoup
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer
import praw

## Extracting the data from praw

In [2]:
reddit=praw.Reddit(client_id='Xizrpkw0yLJdBQ',client_secret='IPVu1oXIDtd2jjX8S4PJ499E6vk',user_agent='my_reddit_scraper')
subreddit=reddit.subreddit('india')

In [8]:
def get_data(post_id_str,topics_dict):
    post_id_list=post_id_str.split()
    for post_id in post_id_list:
        print("getting next post id")
        submission=reddit.submission(id=post_id)
        topics_dict["flair"].append(str(submission.link_flair_text))
        topics_dict["title"].append(submission.title)
        topics_dict["id"].append(submission.id)
        topics_dict["url"].append(submission.url)
        submission.comments.replace_more(limit=None)
        comment = ''
        numcomments=0
        for top_level_comment in submission.comments:
            comment = comment + ' ' + top_level_comment.body
            numcomments+=1
            if numcomments>60:
                break
        if(len(comment)==0):
            topics_dict['comments'].append("No comments")
        else:
            topics_dict["comments"].append(comment)
    print("data acquired")
    return topics_dict

# Processing the Data

In [9]:

def string_form(value):
    return str(value)

def clean_text(text):
   
    text = BeautifulSoup(text, "lxml").text
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub('', text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)
    return text

In [10]:
stop_words=stopwords.words('english')
lancaster=LancasterStemmer()

In [11]:
def text_process(words): 
    words=str(words)
    if(len(words)==0):
        return [lancaster.stem(word) for word in "No Body"]
    words= BeautifulSoup(words, "lxml").text
    words=words.lower()
    nopunc=[char for char in words if char not in string.punctuation]
    nopunc=''.join(nopunc)
    wordlist= [lancaster.stem(word) for word in nopunc.split() if word.lower() not in stopwords.words('english')]
    wordlist=' '.join(wordlist)
    return wordlist

## URL processing

In [12]:
def url_process(words):
    words=str(words)
    if(len(words)==0):
        return [word for word in "No Body"]
    words= BeautifulSoup(words, "lxml").text
    words=words.lower()
    nopunc=[]
    for char in words:
        if char in string.punctuation:
            nopunc.append(" ")
        else:
            if(char.isdigit()==False):
                nopunc.append(char)
    nopunc=''.join(nopunc)
    stop_words=stopwords.words('english')
    stop_words.append('comments')
    stop_words.append('reddit')
    stop_words.append('https')
    stop_words.append('r')
    stop_words.append('www')
    stop_words.extend(['comments','reddit','https','r','www','com','india','http','html','news'])
    wordlist= [word for word in nopunc.split() if word.lower() not in stop_words]
    wordlist=' '.join(wordlist)
    return wordlist

In [13]:
def text_process2(words):
    words=str(words)
    nopunc=[]
    for char in words:
        if char in string.punctuation:
            nopunc.append(" ")
        else:
            if(char.isdigit()==False):
                nopunc.append(char)
    nopunc=''.join(nopunc)
    stop_words=stopwords.words('english')
    extended_stopwords=['ind','lik','peopl','ev','com','govern','think','bank','tim','work','dont','fuck','new','mak','said','year','nee','want','country','day','giv','thing','good','say','tak','ind','india','peopl']
    stop_words.extend(extended_stopwords)
    wordlist= [word for word in nopunc.split() if word.lower() not in stop_words]
    wordlist=' '.join(wordlist)
    return wordlist

# Loading the model

In [5]:
filename='trained_pipeline_pickle.sav'
model=pickle.load(open(filename, 'rb'))

In [21]:
def predict(post_id):
    topics_dict={"flair":[], "title":[],"id":[], 
                          "url":[],"comments":[]}        
    #get the data
    print("getting the data")
    data_dict=get_data(post_id,topics_dict)
    print("converting to dataframe")
    topics_data=pd.DataFrame(data_dict)
    # Do data cleaning and stemming to get text data in the processed form
    print("cleaning the data")
    topics_data['stem_comments']=topics_data['comments'].apply(lambda x: text_process(x))
    topics_data['stemmed_titles']=topics_data['title'].apply(lambda x: text_process(x))
    topics_data['stemmed_url']=topics_data['url'].apply(lambda x: url_process(x))
    topics_data['title_comments_stem']=topics_data['stemmed_titles']+topics_data['stem_comments']
    #Get the prediction
    print("using model to predict flairs")
    flair_text=topics_data['flair']
    prediction=model.predict(topics_data['title_comments_stem']+topics_data['stemmed_url'])
    print("Model prediction is ",prediction[0], "\n Original post flair is ",topics_data['flair'])    

In [20]:
predict('7nc7wa')

getting the data
getting next post id
data acquired
converting to dataframe
cleaning the data
using model to predict flairs
Model prediction is  AskIndia 
 Original post flair is  Series([], Name: flair, dtype: object)


In [71]:
predict('g3jeid')

Model prediction is  AskIndia 
 Original post flair is  0    AskIndia
Name: flair, dtype: object


  ' that document to Beautiful Soup.' % decoded_markup


In [75]:
predict('eaxdzc')

Model prediction is  Politics 
 Original post flair is  0    Politics
Name: flair, dtype: object


  ' that document to Beautiful Soup.' % decoded_markup


In [34]:
predict('g41sd9')

Model prediction is  Politics 
 Original post flair is  Series([], Name: flair, dtype: object)
