In [11]:
import praw
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import pickle
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score

In [8]:
reddit = praw.Reddit(client_id='MwhK0qtk4ZqRdw', client_secret='cjytrTxD1OR4KtmEyZRk6wY7tfI', user_agent='flair' ,username='gcgvhjchvt2244')
flairs_list = ["AskIndia", "Non-Political", "[R]eddiquette", "Scheduled", "Photography", "Science/Technology", "Politics", "Business/Finance", "Policy/Economy", "Sports", "Food", "AMA"]
sub = reddit.subreddit('india')
data_dict = {"flairs":[], "titles":[], "upvotes":[], "id":[], "url":[], "num_comms": [], "created": [], "content":[], "op":[], "comments":[]}

for flair in flairs_list:
    posts = sub.search(flair, limit=110)
    for post in posts:
        data_dict["flairs"].append(flair)
        data_dict["titles"].append(post.title)
        data_dict["upvotes"].append(post.score)
        data_dict["id"].append(post.id)
        data_dict["url"].append(post.url)
        data_dict["num_comms"].append(post.num_comments)
        data_dict["created"].append(post.created)
        data_dict["content"].append(post.selftext)
        data_dict["op"].append(post.author)

        post.comments.replace_more(limit=None)
        comment = ''
        for top_level_comment in post.comments:
            comment += ' ' + top_level_comment.body
        data_dict["comments"].append(comment)

data_df = pd.DataFrame(data_dict)
print(data_df.head())

     flairs                                             titles  upvotes  \
0  AskIndia  Need feedback for Insurance Policy that I took...        1   
1  AskIndia   Somebody want to kill my full family what to do?       96   
2  AskIndia  Ambassador of India takes back my newly issued...       13   
3  AskIndia             [AskIndia] Cingari, Cengar or Tzengar?        0   
4  AskIndia  Recommendations for books on Indian history wr...       17   

       id                                                url  num_comms  \
0  1s57oi  https://www.reddit.com/r/india/comments/1s57oi...          1   
1  b7pvwt  https://www.reddit.com/r/india/comments/b7pvwt...         24   
2  bdfid1  https://www.reddit.com/r/india/comments/bdfid1...         27   
3  18ntue  https://www.reddit.com/r/india/comments/18ntue...          0   
4  avt1qx  https://www.reddit.com/r/india/comments/avt1qx...          9   

        created                                            content  \
0  1.386254e+09  **Re-postin

In [9]:
data_df['comments']

0        Dear Policy Holder(Dhavalcoholic),\n \nWe req...
1        Calm down.\nGo to the SP office of your town,...
2        Honestly, she and her supervisor behaved *exa...
3                                                        
4        The Discovery of India by J.Nehru.\n\nYou wil...
                              ...                        
1223     I just hope he completely attends the AMA and...
1224     Guys, can someone explain why randia is bashi...
1225     Nobody knows the future. We never saw it cumm...
1226     There is a context to that statement. \n\nPlu...
1227     1 large popcorn + 1 large cold drink = Rs. 21...
Name: comments, Length: 1228, dtype: object

In [14]:

tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5,
                        ngram_range=(1, 2), 
                        stop_words='english')

X_train, X_test, y_train, y_test = train_test_split(data_df['titles']+data_df['comments']+data_df['content'], data_df['flairs'], random_state= 0)

fitted_vectorizer = tfidf.fit(X_train)
tfidf_vectorizer_vectors = fitted_vectorizer.transform(X_train)
model = LinearSVC().fit(tfidf_vectorizer_vectors, y_train)
# # (model.predict(fitted_vectorizer.transform([data])))
# with open('model_pickle','wb') as file:
#     pickle.dump(model,file)


from sklearn.externals import joblib

joblib.dump(model, 'model_joblib')
joblib.dump(fitted_vectorizer, 'fitted_vectorizer')

['fitted_vectorizer']

In [13]:

data_df['category_id'] = data_df['flairs'].factorize()[0]
category_id_df = data_df[['flairs', 'category_id']].drop_duplicates()


# Dictionaries for future use
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['category_id', 'flairs']].values)
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5,
                        ngram_range=(1, 2),
                        stop_words='english')

# We transform each complaint into a vector
features = tfidf.fit_transform(data_df.comments+data_df.titles+data_df.content).toarray()

labels = data_df.category_id

print("Each of the %d complaints is represented by %d features (TF-IDF score of unigrams and bigrams)" % (features.shape))
X = data_df['comments']  # Collection of documents
# Target or the labels we want to predict (i.e., the 13 different complaints of flairss)
y = data_df['flairs']

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.25,
                                                    random_state=0)
models = [
    RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]

# 5 Cross-validation
CV = 5
cv_df = pd.DataFrame(index=range(CV * len(models)))

entries = []
for model in models:
  model_name = model.__class__.__name__
  accuracies = cross_val_score(
      model, features, labels, scoring='accuracy', cv=CV)
  for i in range(len(accuracies)):
    entries.append((model_name, i, accuracies[i]))

cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])
mean_accuracy = cv_df.groupby('model_name').accuracy.mean()
std_accuracy = cv_df.groupby('model_name').accuracy.std()

acc = pd.concat([mean_accuracy, std_accuracy], axis=1,
                ignore_index=True)
acc.columns = ['Mean Accuracy', 'Standard deviation']
acc


Each of the 1228 complaints is represented by 10463 features (TF-IDF score of unigrams and bigrams)




Unnamed: 0_level_0,Mean Accuracy,Standard deviation
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1
LinearSVC,0.821702,0.043991
LogisticRegression,0.794813,0.050424
MultinomialNB,0.69627,0.034636
RandomForestClassifier,0.785087,0.082937
