In [1]:
"""
Проводим разбивку пользователдей на группы для А/В тестирования и применяем к ним две модели в зависимости от группы
"""

'\nПроводим разбивку пользователдей на группы для А/В тестирования и применяем к ним две модели в зависимости от группы\n'

In [2]:
from fastapi import  FastAPI, Depends, HTTPException
from sqlalchemy.orm import Session
#from schema import PostGet
from typing import List
from json import dumps
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from datetime import datetime
from pydantic import BaseModel
import hashlib
from url import conn_uri

In [3]:
"""
Создадим классы для валидации выходных данных 
"""

class PostGet(BaseModel):
    id: int
    text: str
    topic: str
    
    class Config:
        orm_mode = True

class Response(BaseModel):
    exp_group: str
    recommendations: List[PostGet]


In [4]:
SQLALCHEMY_DATABASE_URL = conn_uri

engine = create_engine(SQLALCHEMY_DATABASE_URL)

SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)


In [5]:
import os
from catboost import CatBoostClassifier

def get_model_path_test(path: str) -> str:
    """
    Функция проверяет где работает модель - локально или на удаленной машине
    """
    if os.environ.get("IS_LMS") == "1":  # проверяем где выполняется код 
        MODEL_PATH = '/workdir/user_input/model_test'
    else:
        MODEL_PATH = path
    return MODEL_PATH

def get_model_path_control(path: str) -> str:
    """
    Функция проверяет где работает модель - локально или на удаленной машине
    """
    if os.environ.get("IS_LMS") == "1":  # проверяем где выполняется код 
        MODEL_PATH = '/workdir/user_input/model_control'
    else:
        MODEL_PATH = path
    return MODEL_PATH

def load_models_test():
    """
    Загружаем модель
    """
    model_path = get_model_path_test("model_Cat_1500it_(2ml)_with_data") 

    model = CatBoostClassifier()  
    model.load_model(model_path, format='cbm') # 

    return model

def load_models_control():
    """
    Загружаем модель
    """
    model_path = get_model_path_control("model_Cat_2000it_(2ml)_with_data_nn") 

    model = CatBoostClassifier()  
    model.load_model(model_path, format='cbm') # 

    return model




In [6]:
def batch_load_sql(query: str) -> pd.DataFrame:
    CHUNKSIZE = 200000
    engine = create_engine(conn_uri)
    
    conn = engine.connect().execution_options(stream_results=True)
    chunks = []
    for chunk_dataframe in pd.read_sql(query, conn, chunksize=CHUNKSIZE):
        chunks.append(chunk_dataframe)
    conn.close()
    return pd.concat(chunks, ignore_index=True)



def load_features_user() -> pd.DataFrame:
    user = batch_load_sql('SELECT * FROM "shestov_user_lesson_22_v2"')
    return user
#Ilya_Shestov_user_lesson_22

def load_features_post_test() -> pd.DataFrame:
    post_sort = batch_load_sql("""SELECT * from "shestov_post_lesson_22_v1.1"
     """)
    return post_sort

def load_features_post_control() -> pd.DataFrame:
    post_sort = batch_load_sql("""SELECT * from "shestov_post_lesson_22_2_pca_nnn"
     """)
    return post_sort

#"ilya_shestov_post_sort_50_lesson_22"

In [7]:
model_cat_test = load_models_test()
model_cat_control = load_models_control()


In [8]:
user = load_features_user().drop(['index'],axis=1)

In [9]:
#post_test = load_features_post_test().drop(['index'],axis=1)
post_test = pd.read_csv('post_transform.csv').drop(['Unnamed: 0'],axis=1)

#post_control = load_features_post_control().drop(['index'],axis=1)
post_control = pd.read_csv('post_transform_nn.csv').drop(['Unnamed: 0'],axis=1)

In [10]:
app = FastAPI()#

In [11]:


def get_group(user, group_count = 2):
    """
    Функция забивает пользователей на группы с помошью хеш функции
    """
    value_str = str(user) + 'first_exp'
    value_num = int(hashlib.md5(value_str.encode()).hexdigest(), 16)
    if  (value_num % group_count) == 0:
        return 'test'
    elif (value_num % group_count) == 1:
        return 'control'


In [12]:
#тестим функцию разбивки
df222 = user.copy()
df222['group'] = user['user_id'].apply(lambda user: get_group(user))
df222

Unnamed: 0,user_id,gender,age,country,city,exp_group,os_iOS,source_organic,group
0,200,1,34,0.876413,0.000123,3,0,0,test
1,201,0,37,0.876413,0.001477,0,0,0,test
2,202,1,17,0.876413,0.002659,4,0,0,control
3,203,0,18,0.876413,0.134028,1,1,0,test
4,204,0,36,0.876413,0.000643,3,0,0,test
...,...,...,...,...,...,...,...,...,...
163200,168548,0,36,0.876413,0.003486,4,0,1,control
163201,168549,0,18,0.876413,0.003848,2,0,1,test
163202,168550,1,41,0.876413,0.011789,4,0,1,control
163203,168551,0,38,0.876413,0.134028,3,1,1,control


In [13]:
#@app.get("/post/recommendations/", response_model=List[PostGet])
def recommended_posts(id: int, time: datetime, limit: int = 5) -> List[Response]:
    """
    Функция принимает на вход id  пользователя, время реакции пользователья на пост и лимит рекомендаций
    Возвращает пользователю рекомендации в количестве limit
    Налету разбивает пользователей на группы использует соответствующую модель
    """
    result_list = []
    exp_group = get_group(id)
    if exp_group == 'test':
        post = post_test
        model_cat = model_cat_test
    
    elif exp_group == 'control':
        post = post_control
        model_cat = model_cat_control
    
    else:
        raise ValueError('unknown group')
        
    
    df_test = pd.merge(
        user[user['user_id']==id], 
        post,how='cross').drop(['user_id'], axis =1).set_index(['post_id'])
    df_test['hour'] = pd.to_datetime(time).hour
    df_test['month'] = pd.to_datetime(time).month
    df_test['dayofweek'] = pd.to_datetime(time).dayofweek
    predict_pr = model_cat.predict_proba(df_test.drop(['text','topic'],axis=1))
    result = pd.DataFrame(predict_pr, index = df_test.index).drop([0], axis=1).sort_values(by =1 ,ascending = False)[:limit]
   
    for i in range(limit):
        id_ = int(result.index[-1+i])
        result_list.append(            
            
                {"id": id_ ,
                "text": str(df_test.loc[id_]['text']),
                "topic":str(df_test.loc[id_]['topic']) }
            
        )

    return [exp_group, result_list]

In [14]:


rec = recommended_posts(id = 204, time = '01.01.2001', limit= 5)

rec2 = recommended_posts(id = 206, time = '01.01.2001', limit= 5)

In [15]:
rec

['test',
 [{'id': 2704,
   'text': 'THIS! THIS! THIS!\n\n#WearAMaskSaveALifePlease #SaturdayThoughts #COVID19 https://t.co/dOheu4t7Nd',
   'topic': 'covid'},
  {'id': 4748,
   'text': 'Sometimes when I hear an A-list cast will be bunched up together for 2 hours in a movie I hope, and pray that it is good, not for the sake of my 10 bucks or 2 hours, but for the sake of these actors careers. In the case of Be Cool, everything went to waste.In the beginning of the film John Travolta (aka Chili Palmer) and a music executive played by James Woods are driving in a car talking about movie sequels, and how most arent good. If you look passed the fact that this scene was shot the same way Quentin Tarrantino filmed his car scene in Pulp Fiction, and listen to the dialogue you cant help but ponder whether this is 1) a disclaimer to the audience that this movie is going to suck, or 2) an attempt to get the audience laughing at the sheer humor of 2 people talking about sequels in a sequel. Oh the i

In [16]:
rec2

['control',
 [{'id': 1395,
   'text': 'Connors boost for British tennis\n\nFormer world number one Jimmy Connors is planning a long-term relationship with the Lawn Tennis Association to help unearth the next Tim Henman.\n\nThe American spent three days at the LTAs annual Elite Performance winter camp in La Manga earlier this week. Britain has the right attitude, said Connors. The more involved I can be with the LTA, the better. A short-term arrangement is just confusing. The kids will ask: What am I doing there? LTA chief executive, John Crowther, added: The relationship that Jimmys already started to develop with the coaches and the players has said to us that wed like some more of it. We want to use Jimmy for a number of weeks a year and we hope this is the beginning of a good long-term relationship.\n\nThe camp played host to more than 30 leading senior and junior players, including Greg Rusedski, Arvind Parmar and Anne Keothavong. La Manga is an amazing site to take a bunch of kids