In [None]:
# Colab-only: mount Google Drive at /content/drive. The notebook's data and
# model files live under relative 'drive/MyDrive/jarvis_data/...' paths, which
# presumably rely on the working directory being /content -- TODO confirm.
from google.colab import drive

drive.mount("/content/drive")

In [None]:
import os
import re
import string
import nltk
import joblib
import pandas as pd
import tweepy as tw


from textblob import TextBlob
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

In [None]:
# Download the NLTK resources used by this notebook, then build the
# module-level lookups: the English stop-word list and the `words` corpus set.
for resource in ('punkt', 'words', 'stopwords'):
    nltk.download(resource)

stop_words = stopwords.words("english")
words = set(nltk.corpus.words.words())

In [9]:
# Twitter API credentials come from environment variables so that no secret is
# stored in the notebook; a variable that is not set yields None.
API_KEY = os.environ.get("API_KEY")
API_SECRET_KEY = os.environ.get("API_SECRET_KEY")
ACCESS_TOKEN = os.environ.get("ACCESS_TOKEN")
ACCESS_SECRET_TOKEN = os.environ.get("ACCESS_SECRET_TOKEN")
BEARER_TOKEN = os.environ.get("BEARER_TOKEN")

In [None]:
# Build the tweepy client from the credentials read from the environment.
auth = tw.OAuthHandler(API_KEY, API_SECRET_KEY)
auth.set_access_token(ACCESS_TOKEN, ACCESS_SECRET_TOKEN)
api = tw.API(auth)

# Check the credentials here so a bad or missing key is reported in this cell
# rather than surfacing later in the search cell.
try:
    api.verify_credentials()
except Exception as auth_error:
    # `except Exception` (instead of a bare `except:`) lets KeyboardInterrupt
    # and SystemExit propagate, and the underlying cause is shown rather than
    # silently discarded.
    print("Error during authentication")
    print(f"  cause: {auth_error!r}")
else:
    print("Authentication OK")

In [390]:
# --- Search parameters: edit these for each keyword / label being collected ---
search_words = "memorize"
date_since = "2018-11-16"
label ="note"
file_name = 'drive/MyDrive/jarvis_data/note_data_test.csv'
# The retweet filter is appended to the query string sent to the search API.
filtered_search = f"{search_words} -filter:retweets"

# Request at most 300 English results for the query, starting from `date_since`.
tweets = tw.Cursor(
    api.search,
    q=filtered_search,
    lang="en",
    since=date_since,
).items(300)

# One (id, text) row per tweet; every row is tagged with this search's label.
rows = [(tweet.id, tweet.text) for tweet in tweets]
data = pd.DataFrame(rows, columns=['tweet_id', 'tweet_text']).assign(label=label)

# Append mode with no header row: repeated runs accumulate into the same file.
data.to_csv(file_name, mode='a', index=False, header=None)

In [391]:

src_file = 'drive/MyDrive/jarvis_data/note_data_test.csv'
dest_file = 'drive/MyDrive/jarvis_data/note_data_test_no_duplicates.csv'

# The collection cell appends rows WITHOUT a header line, so the file has to be
# read with header=None. With the default header inference the first tweet in
# the file is consumed as column names and silently dropped from the data.
# tweet_id is read as text so every id compares the same way when de-duplicating.
data = pd.read_csv(
    src_file,
    header=None,
    names=['tweet_id', 'tweet_text', 'label'],
    dtype={'tweet_id': str},
)

# Tolerate a file that was seeded by hand with a "tweet_id,tweet_text,label"
# header line: such a line would otherwise be kept as a data row.
data = data[data['tweet_id'] != 'tweet_id']

# Keep the first occurrence of each tweet id, then write a file WITH a header.
data = data.drop_duplicates(subset=["tweet_id"], keep='first')
data.to_csv(dest_file, mode='w', index=False)

In [253]:
def cleaner(tweet):
  """Normalise one tweet into lower-case, space-separated words.

  Steps, in order: lower-case the text, drop non-ASCII characters, remove
  URLs, @mentions and #hashtags, remove apostrophe suffixes ('s, 're, ...),
  replace punctuation with spaces, remove tokens containing digits, tokenise,
  and finally drop English stop words and the standalone word "link".

  Args:
    tweet: Raw tweet text. A non-string value (for example the NaN pandas
      produces for an empty CSV field) is treated as empty text.

  Returns:
    The cleaned text, words separated by single spaces with no leading or
    trailing whitespace. May be the empty string.
  """
  if not isinstance(tweet, str):
    return ''

  tweet = tweet.lower()
  tweet = tweet.encode('ascii', 'ignore').decode()

  # These patterns must run on the raw text, before tokenising: they rely on
  # a URL / "@name" / "#tag" still being one unbroken run of non-space
  # characters. Tokenising first separates the marker from the rest, so the
  # username, hashtag text and URL fragments would be left in the output.
  tweet = re.sub(r'https*\S+', ' ', tweet)
  tweet = re.sub(r'@\S+', ' ', tweet)
  tweet = re.sub(r'#\S+', ' ', tweet)
  tweet = re.sub(r'\'\w+', '', tweet)
  tweet = re.sub('[%s]' % re.escape(string.punctuation), ' ', tweet)
  tweet = re.sub(r'\w*\d+\w*', '', tweet)

  # Build the lookup once per call instead of re-reading the stop-word corpus
  # for every token.
  stop_set = set(stop_words)

  # A token is dropped when it is a stop word, full stop. (An extra
  # "or w.isalpha()" clause would keep every alphabetic token and turn this
  # filter into a no-op.) "link" is removed as a whole word only, so words
  # that merely contain it, such as "blink", are left intact.
  kept = [w for w in nltk.word_tokenize(tweet)
          if w not in stop_set and w != 'link']

  # Joining tokens also collapses whitespace runs and trims both ends.
  return ' '.join(kept)

In [392]:
src_file = 'drive/MyDrive/jarvis_data/note_data_test_no_duplicates.csv'
dest_file = 'drive/MyDrive/jarvis_data/clean_note_data.csv'

# Load the de-duplicated tweets, run every text through `cleaner`, shuffle the
# rows (frac=1 keeps all of them) and renumber the index from zero.
data = (
    pd.read_csv(src_file)
    .assign(tweet_text=lambda frame: frame['tweet_text'].apply(cleaner))
    .sample(frac=1)
    .reset_index(drop=True)
)
data.to_csv(dest_file, mode='w', index=False)

In [393]:
google_data_file = 'drive/MyDrive/jarvis_data/clean_google_data.csv'
weather_data_file = 'drive/MyDrive/jarvis_data/clean_weather_data.csv'
wikipedia_data_file = 'drive/MyDrive/jarvis_data/clean_wikipedia_data.csv'
joke_data_file = 'drive/MyDrive/jarvis_data/clean_joke_data.csv'
timer_data_file = 'drive/MyDrive/jarvis_data/clean_timer_data.csv'
stop_data_file = 'drive/MyDrive/jarvis_data/clean_stop_data.csv'
note_data_file = 'drive/MyDrive/jarvis_data/clean_note_data.csv'
data_file = 'drive/MyDrive/jarvis_data/data.csv'

# The two columns every per-label file contributes to the combined dataset.
text_label_cols = ["tweet_text", "label"]

# Existing combined dataset, plus at most 1000 rows from each per-label file.
data = pd.read_csv(data_file)
google_data = pd.read_csv(google_data_file, usecols=text_label_cols, nrows=1000)
weather_data = pd.read_csv(weather_data_file, usecols=text_label_cols, nrows=1000)
wikipedia_data = pd.read_csv(wikipedia_data_file, usecols=text_label_cols, nrows=1000)
joke_data = pd.read_csv(joke_data_file, usecols=text_label_cols, nrows=1000)
timer_data = pd.read_csv(timer_data_file, usecols=text_label_cols, nrows=1000)
stop_data = pd.read_csv(stop_data_file, usecols=text_label_cols, nrows=1000)
note_data = pd.read_csv(note_data_file, usecols=text_label_cols, nrows=1000)

# NOTE(review): only `note_data` is merged below; the other per-label frames
# are loaded but not used in this cell.
# pd.concat keeps the merge pandas-only: numpy is not imported by this
# notebook, so an `np.` call here raises NameError on a fresh kernel.
# `note_data` goes first so that, for a duplicated text, keep='first' retains
# the newly collected row.
data = pd.concat(
    [note_data[text_label_cols], data[text_label_cols]],
    ignore_index=True,
)
data = data.drop_duplicates(subset=['tweet_text'], keep='first')

# Shuffle all rows, renumber the index, and overwrite the combined dataset.
data = data.sample(frac=1).reset_index(drop=True)
data.to_csv(data_file, mode='w', index=False)

In [None]:
src_file = 'drive/MyDrive/jarvis_data/data.csv'
model_dest_filepath = 'drive/MyDrive/jarvis_data/models/finalized_model.sav'
vectorizer_dest_filepath = 'drive/MyDrive/jarvis_data/models/vectorizer.sav'
# Fixed seed so the train/test split (and therefore the saved model) is
# reproducible across kernel restarts.
random_state = 42

data = pd.read_csv(src_file)
# An empty text field in the CSV is read back as NaN, and CountVectorizer
# rejects NaN documents; rows without text or label cannot be used for training.
data = data.dropna(subset=['tweet_text', 'label']).reset_index(drop=True)

# Bag-of-words features limited to the 100 most frequent terms. GaussianNB
# needs a dense array, hence .toarray().
vectorizer = CountVectorizer(max_features=100)
vectors = vectorizer.fit_transform(data.tweet_text.to_list()).toarray()
labels = data.label.to_list()

vectors_train, vectors_test, topics_train, topics_test = train_test_split(
    vectors, labels, random_state=random_state)

classifier = GaussianNB()
classifier.fit(vectors_train, topics_train)

# Score the held-out split so the quality of the persisted model is visible.
topics_pred = classifier.predict(vectors_test)
print(classification_report(topics_test, topics_pred))

# joblib.dump does not create missing directories.
for dest_path in (model_dest_filepath, vectorizer_dest_filepath):
    os.makedirs(os.path.dirname(dest_path), exist_ok=True)

joblib.dump(classifier, model_dest_filepath)
joblib.dump(vectorizer, vectorizer_dest_filepath)

