In [11]:
from google.cloud import bigquery
from google.oauth2.service_account import Credentials


cred = Credentials.from_service_account_file(
    'nwo-sample-5f8915fdc5ec.json')

BigQuery_client = bigquery.Client(project = "nwo-sample",credentials = cred)

In [73]:
print(BigQuery_client.query("SELECT count(*) from `nwo-sample.graph.reddit`").result().to_dataframe())

        f0_
0  60601557


In [163]:
reddit_query = \
"""
SELECT body as text, subreddit
  
FROM `nwo-sample.graph.reddit`

LIMIT 100000
"""

reddit_results = BigQuery_client.query(reddit_query).result().to_dataframe()

 
print (reddit_results.head())


                                                text           subreddit
0                   Argan oil is $10-15 on Amazon...  Skincare_Addiction
1  Ringworm, possibly? I think it's definitely wo...         Dermatology
2  Were going 100% electric but I like the idea o...     RenewableEnergy
3  I'm sure you can do this. (A junior account is...   eupersonalfinance
4  And I think originally it's $200 but you get $...         AusSkincare


In [164]:
def arr_tags(arr):
    result = []
    for tag in arr:
        result.append(tag)
    return result

reddit_results['tags'] = arr_tags(reddit_results['subreddit'])

In [165]:
#reddit_results = reddit_results.rename(columns={"body": "text"})
#print (reddit_results.head())

In [166]:
tweets_query = \
"""
SELECT tweet as text,hashtags
  
FROM `nwo-sample.graph.tweets`

WHERE ARRAY_LENGTH(hashtags)=1

LIMIT 100000
"""

tweets_results = BigQuery_client.query(tweets_query).result().to_dataframe()

 
print (tweets_results.head())



                                                text  \
0  WATCH: @chrislhayes breaks down Joe Biden's se...   
1  In a span of 12 hours, a video showing the fin...   
2  New #TeawithGaryVee, tomorrow 9am ET ☕️\n\nRSV...   
3  "I look up to you, man." 🤝\n\nA true sign of r...   
4  Honesty, transparency and clarity will be vita...   

                    hashtags  
0          [#firechrishayes]  
1  [#justiceforahmaudarbery]  
2          [#teawithgaryvee]  
3                  [#ufc249]  
4             [#coronavirus]  


In [175]:



def clean_tags(arr):
    result = []
    for line in arr:
        result.append(line[0].replace("#",""))
        #result.append([text.replace("#","") for text in line])
    return result

tweets_results['tags'] = clean_tags(tweets_results['hashtags'])

In [176]:
#tweets_results = tweets_results.rename(columns={"tweet": "text"})
print (tweets_results.head())


                                                text  \
0  WATCH: @chrislhayes breaks down Joe Biden's se...   
1  In a span of 12 hours, a video showing the fin...   
2  New #TeawithGaryVee, tomorrow 9am ET ☕️\n\nRSV...   
3  "I look up to you, man." 🤝\n\nA true sign of r...   
4  Honesty, transparency and clarity will be vita...   

                    hashtags                    tags  
0          [#firechrishayes]          firechrishayes  
1  [#justiceforahmaudarbery]  justiceforahmaudarbery  
2          [#teawithgaryvee]          teawithgaryvee  
3                  [#ufc249]                  ufc249  
4             [#coronavirus]             coronavirus  


In [177]:
import pandas

result = pandas.concat([reddit_results[["text","tags"]],tweets_results[["text","tags"]]])
print (len(result))
print (result.head(20))


200000
                                                 text                 tags
0                    Argan oil is $10-15 on Amazon...   Skincare_Addiction
1   Ringworm, possibly? I think it's definitely wo...          Dermatology
2   Were going 100% electric but I like the idea o...      RenewableEnergy
3   I'm sure you can do this. (A junior account is...    eupersonalfinance
4   And I think originally it's $200 but you get $...          AusSkincare
5                           I live in the Netherlands    eupersonalfinance
6   Thanks for that link, I'm going to read it lat...          algotrading
7   It's crazy how you literally defend Trump on e...     moderatepolitics
8   The one without fragrance is marked as being f...   Skincare_Addiction
9   Only NIFTYBEES is easily tradable and provides...     IndiaInvestments
10  I own 37 from BSE and thought increase to arou...     IndiaInvestments
11  I've interesting point of view. When I had lar...     IndiaInvestments
12  This needs to 

In [53]:
import spacy

nlp = spacy.load("en_core_web_sm")

def getEntities():
    entities =[]
    for line in result["text"]:
        doc = nlp(line)
        entities.append(doc.ents)
    return entities
    
result["entities"]=getEntities()        

print (result.head(20))        

                                                text  \
0                   Argan oil is $10-15 on Amazon...   
1  Ringworm, possibly? I think it's definitely wo...   
2  Were going 100% electric but I like the idea o...   
3  I wanted to suggest infopass but I see you hav...   
4  &gt;There's no reason we shouldn't do everythi...   
5  I'm sure you can do this. (A junior account is...   
6  And I think originally it's $200 but you get $...   
7  Catastrophe Bonds are pretty niche and super i...   
8  I feel you homie. Maybe it's different because...   
9                      Yeah! Down 90% at the moment!   
0  Six-time #F1 world champion Lewis Hamilton has...   
1  #UFC249 will not air in bars, but you can stil...   
2  Sat down on a jab! 😳\n🇸🇷 @JairRozenstruik can ...   
3  New K-pop boy band @CRAVITYstarship had a big ...   
4  Report: Arizona is third worst state for at-ri...   
5  #SidharthShukla and #ShehnaazGill fans root fo...   
6  "As more & more countries consider how to eas

In [139]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z _]')
STOPWORDS = set(stopwords.words('english'))
#nltk.download('punkt')

from nltk.tokenize import WhitespaceTokenizer 
tokenizer = nltk.tokenize.WhitespaceTokenizer()


def text_prepare(text):
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(" ",text)
    text = BAD_SYMBOLS_RE.sub("",text)
    
    #print(text)
    #print(word_tokenize(text))
    #tokenizer = nltk.tokenize.api.StringTokenizer()
    #for w in word_tokenize(text):
    #    if (w[:17] == "conversion failed"):
    #        raise w
    text = " ".join([w for w in  tokenizer.tokenize(text) if not w in STOPWORDS])
    #print(text)
    return text


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\larkw\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [178]:
tags =result['tags'].values

def build_tag_dict(seq):
    tags = {}
    
    for elem in seq:
        #elem = elem.strip()
        if elem in tags:
            tags[elem]+=1
        else:
            tags[elem]=1
    return tags
                
def filter_tags(tags):
    result = []
    for tag in tags.items():
        if tag[1]>1:
            result.append(tag[0])
    return result
                
tags = filter_tags(build_tag_dict(tags))

print(len(tags))
print (tags)


10883
['Skincare_Addiction', 'Dermatology', 'RenewableEnergy', 'eupersonalfinance', 'AusSkincare', 'algotrading', 'moderatepolitics', 'IndiaInvestments', 'SkincareAddicts', 'SkincareAddictionUK', '30PlusSkinCare', 'Sephora', 'Forex', 'PersonalFinanceNZ', 'Ask_Politics', 'CryptoCurrency', 'AusFinance', 'geopolitics', 'options', 'CanadianInvestor', 'PanPorn', 'ZeroWaste', 'MachineLearning', 'TechNewsToday', 'Daytrading', 'btc', 'nyc', 'lgbt', 'news', 'space', 'stocks', 'Bitcoin', 'atheism', 'finance', 'science', 'business', 'politics', 'startups', 'Economics', 'RobinHood', 'worldnews', 'RealEstate', 'technology', 'ukpolitics', 'weedstocks', 'AsianBeauty', 'environment', 'gunpolitics', 'immigration', 'pennystocks', 'Conservative', 'Entrepreneur', 'WayOfTheBern', 'UpliftingNews', 'unitedkingdom', 'CanadaPolitics', 'wallstreetbets', 'NeutralPolitics', 'UKPersonalFinance', 'PersonalFinanceCanada', 'justiceforahmaudarbery', 'ufc249', 'coronavirus', 'covid19', 'heattwitter', 'intoamerica', 'no

In [179]:
rslt_df = result[result['tags'].isin(tags)] 
X_train, y_train = rslt_df['text'].values, list(rslt_df['tags'].values)
X_train = [text_prepare(x) for x in X_train]
print (len(X_train))

155953


In [159]:
print (len(X_train))
print (y_train)


9998
['SkincareAddicts', 'Sephora', 'immigration', 'immigration', 'immigration', 'immigration', 'IndiaInvestments', 'immigration', 'moderatepolitics', 'immigration', 'eupersonalfinance', 'moderatepolitics', 'immigration', 'immigration', 'SkincareAddicts', 'IndiaInvestments', 'RenewableEnergy', 'immigration', 'immigration', 'AusSkincare', 'algotrading', 'IndiaInvestments', 'SkincareAddicts', 'algotrading', 'moderatepolitics', 'RenewableEnergy', 'algotrading', 'IndiaInvestments', 'algotrading', 'immigration', 'moderatepolitics', 'moderatepolitics', 'IndiaInvestments', '30PlusSkinCare', 'moderatepolitics', 'SkincareAddicts', 'AusSkincare', 'Skincare_Addiction', 'algotrading', 'SkincareAddicts', 'IndiaInvestments', 'PanPorn', 'CryptoCurrency', 'AusFinance', 'CanadianInvestor', 'options', 'CryptoCurrency', 'AusFinance', 'options', 'PanPorn', 'AusFinance', 'Forex', 'Dermatology', 'Skincare_Addiction', 'immigration', 'immigration', 'immigration', 'PanPorn', 'options', 'AusFinance', 'AusFinanc

In [180]:
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_features(X_train, norm = "l2"):
    """
        X_train, X_val, X_test — samples        
        return TF-IDF vectorized representation of each sample and vocabulary
    """ 
    # Create TF-IDF vectorizer with a proper parameters choice
    # Fit the vectorizer on the train set
    # Transform the train, test, and val sets and return the result
    
    
    tfidf_vectorizer = TfidfVectorizer(min_df=5, max_df=0.9, ngram_range=(1, 2), token_pattern = '(\S+)',
        stop_words = "english", #helps!
        norm=norm,sublinear_tf=False)
    X_train = tfidf_vectorizer.fit_transform(X_train)
    #X_test = tfidf_vectorizer.transform(X_test)
    #X_val = tfidf_vectorizer.transform(X_val)
    
    
    return X_train,tfidf_vectorizer     #tfidf_vectorizer.vocabulary_

X_train_tfidf,vectorizer = tfidf_features(X_train)

In [149]:




from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer(classes = tags)    #(classes=sorted(tags_counts.keys()))
y_train_n = mlb.fit_transform(y_train)





In [183]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression,SGDClassifier

def train_classifier(X_train, y_train):
    """
      X_train, y_train — training data
      
      return: trained classifier
    """
    
    # Create and fit LogisticRegression wraped into OneVsRestClassifier.
    
    classifier = OneVsRestClassifier(LogisticRegression(solver = "liblinear"),n_jobs=-1)
    #classifier = OneVsRestClassifier(SGDClassifier(loss="hinge", penalty="l1",learning_rate="adaptive", eta0=1, max_iter=100),n_jobs = -1)
    return classifier.fit(X_train, y_train)
    
classifier_tfidf = train_classifier(X_train_tfidf, y_train) #y_train_n if we go multilabel
#y_val_predicted_labels_tfidf = classifier_tfidf.predict(X_val_tfidf)
#y_val_predicted_scores_tfidf = classifier_tfidf.decision_function(X_val_tfidf)


In [184]:
#y_train_inversed = mlb.inverse_transform(y_train_n)
y_train_predicted_labels_tfidf = classifier_tfidf.predict(X_train_tfidf)
#y_train_pred_inversed = mlb.inverse_transform(y_train_predicted_labels_tfidf)
#print(y_train_pred_inversed)
for i in range(3):
    print('text:\t{}\nTrue labels:\t{}\nPredicted labels:\t{}\n\n'.format(
        X_train[i],y_train[i],y_train_predicted_labels_tfidf[i]
        #','.join(y_train[i]),
        #','.join(y_train_predicted_labels_tfidf[i])
    ))

text:	argan oil 1015 amazon
True labels:	Skincare_Addiction
Predicted labels:	AsianBeauty


text:	ringworm possibly think definitely worth dermatologist look
True labels:	Dermatology
Predicted labels:	AsianBeauty


text:	going 100 electric like idea using solar power heater thanks idea
True labels:	RenewableEnergy
Predicted labels:	technology




In [191]:
tdf = vectorizer.transform([text_prepare("iPhone")])
print(tdf)                                 
print(classifier_tfidf.predict(tdf))


  (0, 30109)	1.0
['technology']


In [192]:
def train_classifier(X_train, y_train):
    """
      X_train, y_train — training data
      
      return: trained classifier
    """
    
    # Create and fit LogisticRegression wraped into OneVsRestClassifier.
    
    #classifier = OneVsRestClassifier(LogisticRegression(solver = "liblinear"),n_jobs=-1)
    classifier = OneVsRestClassifier(SGDClassifier(loss="hinge", penalty="l1",learning_rate="adaptive", eta0=1, max_iter=100),n_jobs = -1)
    return classifier.fit(X_train, y_train)
    
classifier_sgd = train_classifier(X_train_tfidf, y_train) #y_train_n if we go multilabel

y_train_predicted_labels_tfidf = classifier_sgd.predict(X_train_tfidf)

for i in range(3):
    print('text:\t{}\nTrue labels:\t{}\nPredicted labels:\t{}\n\n'.format(
        X_train[i],y_train[i],y_train_predicted_labels_tfidf[i]
        #','.join(y_train[i]),
        #','.join(y_train_predicted_labels_tfidf[i])
    ))

text:	argan oil 1015 amazon
True labels:	Skincare_Addiction
Predicted labels:	technology


text:	ringworm possibly think definitely worth dermatologist look
True labels:	Dermatology
Predicted labels:	technology


text:	going 100 electric like idea using solar power heater thanks idea
True labels:	RenewableEnergy
Predicted labels:	technology




In [193]:
print(len(y_train_predicted_labels_tfidf))

155953
