In [43]:
import os
import smtplib
import warnings
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from time import sleep

import category_encoders as ce
import numpy as np
import pandas as pd
from google.auth.transport.requests import Request
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build

from cred import key
from watcher import entries_processed

warnings.filterwarnings('ignore')

In [64]:
def parse(data, encoder):
    """
    Re-code raw survey answers into the layout of "students-all.csv" and
    run them through ``encoder``.

    data:    DataFrame of raw responses. It must contain the columns 'Nick'
             and 'Feedback mail'; the 30 survey answers are taken by position
             from the third column up to (excluding) the last one.
             The caller's frame is not modified.
    encoder: object exposing ``transform(DataFrame)``.

    returns: (nicks, mails, data_transformed) - two numpy arrays taken from
             the 'Nick' and 'Feedback mail' columns, and whatever
             ``encoder.transform`` returns for the re-coded frame.

    raises:  KeyError if an answer is missing from the lookup tables below,
             ValueError (from pandas) if the positional slice does not hold
             exactly 30 columns.
    """
    nicks = np.array(data['Nick'])
    mails = np.array(data['Feedback mail'])

    # Explicit copy: the columns below are overwritten, which must neither
    # write through to the caller's frame nor rely on suppressed
    # SettingWithCopy warnings.
    data = data.iloc[:, 2:-1].copy()
    data.columns = [
        'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu', 'Mjob',
        'Fjob', 'reason', 'guardian', 'traveltime', 'studytime', 'failures',
        'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'internet',
        'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences',
        'G1', 'G2', 'G3'
    ]

    # defaults for the attributes that are not part of the survey answers
    data['school'] = "GP"
    data['major'] = "mat"
    data['higher'] = "yes"

    data['age'] = pd.to_numeric(data['age'])

    # Two-valued answers: the listed answer gets the first code, every other
    # answer gets the second one.
    binary_codes = {
        'sex': ('Male', 'M', 'F'),
        'address': ('Rural', 'R', 'U'),
        'famsize': ('more than 3', 'GT3', 'LE3'),
        'Pstatus': ('living apart', 'A', 'T'),
    }
    for column, (answer, if_match, otherwise) in binary_codes.items():
        data[column] = np.where(data[column] == answer, if_match, otherwise)

    education = {
        'none': 0,
        'primary education': 1,
        'middle school': 2,
        'high school': 3,
        'higher education': 4
    }
    job = {
        'teacher': 'teacher',
        'healthcare': 'health',
        'civil services': 'civil',
        'home': 'at_home',
        'other': 'other'
    }
    reason = {
        'close to home': 'home',
        'school reputation': 'reputation',
        'course preference': 'course',
        'other': 'other'
    }
    guardian = {'Mother': 'mother', 'Father': 'father', 'other': 'other'}
    travel_time = {'< 15': 1, '15-30': 2, '30-60': 3, '60 >': 4}
    study_time = {'< 2': 1, '2-5': 2, '5-10': 3, '10 >': 4}

    lookups = {
        'Medu': education,
        'Fedu': education,
        'Mjob': job,
        'Fjob': job,
        'reason': reason,
        'guardian': guardian,
        'traveltime': travel_time,
        'studytime': study_time,
    }
    # Plain indexing (not Series.map) on purpose: an unknown answer must raise
    # KeyError instead of silently becoming NaN.
    for column, table in lookups.items():
        data[column] = [table[answer] for answer in data[column]]

    # Grades are rescaled with (g - 2) / 3 * 20 (so 2 -> 0 and 5 -> 20) and
    # truncated to integers.
    for column in ('G1', 'G2', 'G3'):
        rescaled = ((pd.to_numeric(data[column]) - 2) / 3) * 20
        data[column] = np.array(rescaled, dtype='int64')

    # Every integer column goes through pd.to_numeric first so that string
    # cells are handled the same way for all of them.
    integer_columns = ('absences', 'failures', 'famrel', 'freetime', 'goout',
                       'Dalc', 'Walc', 'health')
    for column in integer_columns:
        data[column] = np.array(pd.to_numeric(data[column]), dtype='int64')

    # correct column order
    data = data[[
        'school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu',
        'Fedu', 'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime',
        'studytime', 'failures', 'schoolsup', 'famsup', 'paid', 'activities',
        'nursery', 'higher', 'internet', 'famrel', 'freetime', 'goout', 'Dalc',
        'Walc', 'health', 'absences', 'G1', 'G2', 'G3', 'major'
    ]]

    data_transformed = encoder.transform(data)

    return nicks, mails, data_transformed

In [65]:
def get_new_entries(entries_processed,
                    encoder,
                    rows = 5000, 
                    SAMPLE_SPREADSHEET_ID = '1e1tWLI0vD05bUj-wLWicOnl0iU-GWz0aaWEtRDlTQ2M'):
    """
    Download the responses sheet and parse the rows not handled yet.

    entries_processed:     number of data rows (header excluded) to skip.
    encoder:               passed through to parse().
    rows:                  last sheet row requested; the range is 'A1:AG<rows>'.
    SAMPLE_SPREADSHEET_ID: id of the spreadsheet to read.

    Uses the module-level ``sheet`` handle for the request.

    returns: the (nicks, mails, dataframe) triple produced by parse(). When the
             sheet returns no rows at all (not even a header) two empty arrays
             and an empty DataFrame are returned instead.
    """
    cols = "AG"
    requested_range = 'A1:' + cols + str(rows)
    result = sheet.values().get(spreadsheetId=SAMPLE_SPREADSHEET_ID,
                                range=requested_range).execute()
    values = result.get('values', [])

    if not values:
        # Without a header row there is nothing to name the columns after;
        # indexing row 0 below would raise IndexError.
        print('Processing from rows: {}. Got {} new entries'.format(entries_processed, 0))
        return np.array([]), np.array([]), pd.DataFrame()

    data = pd.DataFrame(values)
    # the first sheet row holds the column names
    data.columns = data.iloc[0,:]
    # skip the header plus everything that was already processed
    data = data.iloc[1+entries_processed:,:]
    print('Processing from rows: {}. Got {} new entries'.format(entries_processed, data.shape[0]))

    nicks, mails, frame = parse(data, encoder)
    return nicks, mails, frame

In [66]:
def predict_new_entries(df, model):
    """
    Score ``df`` with ``model``.

    returns: (labels, probabilities) - the output of ``model.predict(df)`` and
             column 1 of ``model.predict_proba(df)``.
    """
    labels = model.predict(df)
    class_probabilities = model.predict_proba(df)
    return labels, class_probabilities[:, 1]

In [67]:
def send_response(nick, send_to_email, result, prob):
    """
    E-mail one participant the prediction made for them.

    nick:          name used in the greeting.
    send_to_email: recipient address.
    result:        predicted label; 0 produces the "Forever alone" text,
                   anything else the "You have partner!" text.
    prob:          probability inserted into the message as-is.

    The sender account is read from the environment: ALCO_MAIL_USER
    (optional, defaults to the project address) and ALCO_MAIL_PASSWORD
    (required).

    Delivery is best effort: a failure while talking to the SMTP server is
    printed and swallowed so one bad address does not stop the batch.

    returns: None
    raises:  RuntimeError if ALCO_MAIL_PASSWORD is not set.
    """
    # Credentials come from the environment so that no secret lives in the
    # notebook. NOTE(review): the password that used to be hard-coded here
    # should be considered leaked and rotated.
    email = os.environ.get("ALCO_MAIL_USER", "AlcoholRomantics@gmail.com")
    password = os.environ.get("ALCO_MAIL_PASSWORD")
    if not password:
        raise RuntimeError(
            "ALCO_MAIL_PASSWORD is not set; cannot log in to the SMTP server")

    subject = "Student, do you have partner?"

    if result == 0:
        verdict = '\nForever alone'
    else:
        verdict = '\nYou have partner!'

    message = ''.join([
        'Hi, {}'.format(nick),
        "\n\nThank you for participating in our survey! Here are your results: ",
        verdict,
        '\nYour probability of being in romantic relationship is: {}'.format(prob),
        '\n\nThis result was predicted by machine learning model trained on Student Alcohol Consumption dataset from University Of Camerino',
        '\n\nSee you next time!',
    ])

    msg = MIMEMultipart()
    msg["From"] = email
    msg["To"] = send_to_email
    msg["Subject"] = subject
    msg.attach(MIMEText(message, 'plain'))

    try:
        # The context manager closes the connection even when a step fails;
        # the timeout keeps a stalled connection from blocking the loop forever.
        with smtplib.SMTP("smtp.gmail.com", 587, timeout=30) as server:
            server.starttls()
            server.login(email, password)
            server.sendmail(email, send_to_email, msg.as_string())
        print('Sent mail to {}, {}'.format(nick, send_to_email))
    except Exception as exc:
        # Exception (not a bare except) so that KeyboardInterrupt still stops
        # the notebook; the cause is reported instead of being hidden.
        print('Error in sending to {}: {!r}'.format(send_to_email, exc))
    return

In [68]:
def notify_users(n,m,y_predict, y_predict_prob, sleep_time = 0.5):
    """
    Mail every participant that left an address.

    n, m:                      nicks and e-mail addresses, aligned by position.
    y_predict, y_predict_prob: predicted label and probability per participant.
    sleep_time:                pause in seconds after each send_response() call.

    Entries whose address is falsy (empty / None) are skipped without pausing.
    """
    for position, nick in enumerate(n):
        address = m[position]
        if not address:
            continue
        send_response(nick, address, y_predict[position], y_predict_prob[position])
        sleep(sleep_time)

In [69]:
def update(model, encoder, small_sleep_time):
    """
    Run one polling cycle: fetch unprocessed survey rows, predict, notify,
    and persist the new progress counter.

    model:            passed to predict_new_entries().
    encoder:          passed to get_new_entries().
    small_sleep_time: pause in seconds between two notification mails.

    Progress is stored in "./watcher.py" as ``entries_processed=<int>``.
    The file is rewritten only after notify_users() has returned, and not at
    all when there are no new entries.
    """
    with open("./watcher.py", "r") as f:
        # Take whatever follows the first '='; int() ignores surrounding
        # whitespace, so 'entries_processed = 12\n' is accepted as well.
        entries_processed = int(f.read().split("=", 1)[1])

    n, m, df = get_new_entries(entries_processed, encoder)

    if len(n) == 0:
        return

    y_predict, y_predict_prob = predict_new_entries(df, model)
    # Forward the pause; it used to be accepted by this function and ignored.
    notify_users(n, m, y_predict, y_predict_prob, sleep_time=small_sleep_time)

    with open("./watcher.py", "w") as f:
        f.write("entries_processed={}".format(entries_processed + len(n)))

    return

In [70]:
def inf_update(model, encoder, big_sleep_time=60, small_sleep_time=0.5):
    """
    Poll for new survey entries forever.

    Each round calls update(model, encoder, small_sleep_time) and then waits
    big_sleep_time seconds. The loop has no exit condition: it ends only when
    update() or sleep() raises (for example on a keyboard interrupt).
    """
    while True:
        # one polling round, then the pause before the next one
        update(model, encoder, small_sleep_time)
        sleep(big_sleep_time)

In [71]:
# Google Sheets v4 client, authorised with the developer key imported from cred.
service = build(serviceName='sheets', version='v4', developerKey=key)

# Module-level handle that get_new_entries() uses for its read request.
sheet = service.spreadsheets()

In [72]:
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV

# Read the reference dataset once and drop the first column (row names).
df = pd.read_csv("students-all.csv").iloc[:, 1:]

# 'romantic' is the target; everything else is a feature.
y = df.romantic
df = df.drop("romantic", axis=1)

# The encoder is fitted here and later reused by parse() on incoming answers,
# so both go through the same one-hot encoding.
encoder = ce.OneHotEncoder()
df_one_hot = encoder.fit_transform(df)

target = np.where(y == 'yes', 1, 0)

X_train, X_test, y_train, y_test = train_test_split(
    df_one_hot, target, test_size=0.2, random_state=666)

# The classifier is loaded from disk instead of being refitted on every run.
# NOTE(review): presumably 'alco.model' was produced by the fit below - confirm.
#
#   xgb_model = xgb.XGBClassifier(max_depth=7, booster="dart",
#                                 colsample_bytree=0.3, learning_rate=0.39,
#                                 reg_alpha=0.9, reg_lambda=1.8, subsample=0.8)
#   xgb_model.fit(X_train, y_train)
xgb_model = xgb.XGBClassifier()
xgb_model.load_model('alco.model')

XGBClassifier(base_score=0.5, booster='dart', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.3, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.39, max_delta_step=0, max_depth=7,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0.9,
              reg_lambda=1.8, scale_pos_weight=1, subsample=0.8,
              tree_method=None, validate_parameters=False, verbosity=None)

In [77]:
from sklearn.metrics import roc_auc_score

# Column 1 of predict_proba and the hard labels for the held-out split.
y_predict_prob = xgb_model.predict_proba(X_test)[:, 1]
y_pres = xgb_model.predict(X_test)

# ROC AUC on the held-out split; kept as the last expression so the cell shows it.
roc_auc_score(y_test, y_predict_prob)

0.8041666666666667

In [75]:
# Start polling every 20 seconds; this cell runs until the kernel is interrupted.
inf_update(model=xgb_model, encoder=encoder, big_sleep_time=20)

Processing from rows: 12. Got 0 new entries
Processing from rows: 12. Got 0 new entries
Processing from rows: 12. Got 0 new entries
Processing from rows: 12. Got 0 new entries
Processing from rows: 12. Got 0 new entries
Processing from rows: 12. Got 1 new entries
Sent mail to przysiąść fałdów, aleksandrap1869@gmail.com
Processing from rows: 13. Got 0 new entries


KeyboardInterrupt: 