In [36]:
# import std libraries
import numpy as np
import pandas as pd
import time

from IPython.display import HTML
import pickle
import json

#import models
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import NMF 
from sklearn.neighbors import KNeighborsClassifier
from scipy.sparse import csr_matrix

import streamlit as st

###################################################
#FOR MODELS

# Data analysis stack
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning stack
from sklearn.model_selection import (
    train_test_split,
    RepeatedStratifiedKFold,
    GridSearchCV,
    RandomizedSearchCV,
    KFold
)
from catboost import CatBoostClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    OneHotEncoder,
    StandardScaler,
    MinMaxScaler,
    FunctionTransformer,
)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neighbors import KNeighborsClassifier

# Evaluation metrics
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
    RocCurveDisplay,
    roc_curve,
    roc_auc_score,
    precision_recall_curve,
    log_loss,
    mean_absolute_error
)
from sklearn.utils.validation import check_is_fitted
from scipy.stats import randint, loguniform

# Miscellaneous
import warnings
from datetime import datetime

warnings.filterwarnings("ignore")

#############################################

In [37]:
#pip install streamlit-aggrid
#pip install streamlit

In [38]:
# Read the prepared data sets; in every file the first CSV column holds the index.
dfN = pd.read_csv("df.csv", index_col=0)
XtrainN = pd.read_csv("Xtrain.csv", index_col=0)
# XtestN = pd.read_csv('Xtest.csv', index_col=0)   # test features are not loaded
ytrain = pd.read_csv("ytrain.csv", index_col=0, dtype=int)
# ytest = pd.read_csv('ytest.csv', index_col=0)    # test labels are not loaded

# Attach the labels to the training features by index; the left join keeps
# every row of the feature frame.
df_trainN = pd.merge(
    XtrainN,
    ytrain,
    how="left",
    left_index=True,
    right_index=True,
)

In [39]:
def recommend_nn(query, model, Rt, k=10, n_neighbors=15):
    """
    Recommend the top k items for a new user with a neighborhood-based
    (nearest-neighbors) collaborative filter.

    Parameters
    ----------
    query : dict
        Maps item label -> rating given by the new user. Labels that are not
        columns of ``Rt`` (e.g. a placeholder entry) are ignored.
    model : object
        Fitted estimator exposing
        ``kneighbors(X, n_neighbors=..., return_distance=...)``. The returned
        neighbor ids are used as row *positions* in ``Rt``.
        NOTE(review): presumably the model was fitted on ``Rt`` itself - confirm.
    Rt : pandas.DataFrame
        User-item matrix: one row per known user, one column per item.
    k : int, default 10
        Number of recommendations to return.
    n_neighbors : int, default 15
        Size of the neighborhood; capped at the number of rows in ``Rt``.

    Returns
    -------
    list
        Up to ``k`` column labels of ``Rt`` not present in ``query``, ordered
        by the summed rating of the neighborhood (highest first).
    """
    # 1. build a one-row user-item frame aligned with the columns of Rt;
    #    items the user did not rate become 0
    new_user = pd.DataFrame(query, columns=Rt.columns, index=['new_user']).fillna(0)

    # 2. scoring: find the users closest to the new user
    #    (kneighbors returns one row per query row; only row 0 exists here)
    _, neighbor_ids = model.kneighbors(
        new_user,
        n_neighbors=min(n_neighbors, len(Rt)),
        return_distance=True,
    )

    # 3. ranking: only look at the ratings of the similar users
    neighborhood = Rt.iloc[neighbor_ids[0]]

    # filter out items already rated by the user; unknown labels are skipped
    # instead of raising a KeyError
    unseen = neighborhood.drop(columns=list(query), errors="ignore")

    # Summing the ratings favours popular items; averaging would instead favour
    # items rated by only a few users in the neighborhood.
    ranked = unseen.sum().sort_values(ascending=False).index.tolist()
    return ranked[:k]

In [40]:
def recommend_nmf(query, model, k=10, titles=None):
    """
    Recommend the top k items for a new user from a trained NMF model.

    The user's ratings are projected into the latent space with
    ``model.transform``, the full rating row is reconstructed with
    ``model.components_``, items the user already rated are removed and the
    remaining items are ranked by reconstructed score.

    Parameters
    ----------
    query : dict
        Maps item label -> rating given by the new user. Labels that are not
        among ``titles`` are ignored.
    model : object
        Fitted factorization model exposing ``transform(X)`` and
        ``components_``.
    k : int, default 10
        Number of recommendations to return.
    titles : sequence, optional
        Item labels in the column order the model was fitted on. When omitted,
        ``model.feature_names_in_`` is used if the model has it; otherwise the
        module-level ``movies['title']`` is used (the original behaviour).

    Returns
    -------
    list
        Up to ``k`` item labels not present in ``query``, highest
        reconstructed score first.
    """
    # resolve the item axis without depending on a global when possible
    if titles is None:
        titles = getattr(model, "feature_names_in_", None)
    if titles is None:
        titles = movies['title']  # legacy fallback: global defined outside this block
    titles = pd.Index(titles)

    # 1. construct the new user-item row from the query; unrated items become 0
    new_user = pd.DataFrame(query, columns=titles, index=['new_user']).fillna(0)

    # 2. scoring: user factors (P) times item factors (Q) gives the
    #    reconstructed ratings for every item
    P_new_user = model.transform(new_user)
    R_hat_new_user = np.dot(P_new_user, model.components_)
    scores = pd.Series(np.asarray(R_hat_new_user)[0], index=titles)

    # 3. ranking: drop items the user already rated, then sort descending
    unseen = scores.drop(labels=list(query), errors="ignore")
    ranked = unseen.sort_values(ascending=False).index.tolist()
    return ranked[:k]

In [41]:
# The labelled training frame is the table the app displays
# (previously: pd.read_csv("best_movies.csv")).
BEST_MOVIES = df_trainN
# Disabled: shifting the index to start at 1
#   BEST_MOVIES.rename(index=lambda x: x+1, inplace=True)

# Choices for the select boxes: a "---" placeholder followed by the values of
# column 'p1' in ascending order.
TITLES = ["---", *BEST_MOVIES["p1"].sort_values().tolist()]

In [42]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE: unpickling can execute arbitrary code - only load trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# NOTE(review): both models are read from the same file, so they are two
# separate copies of the same pickled object - confirm whether NMF_MODEL
# should come from a different pickle.
DISTANCE_MODEL = _load_pickle('model_rdmf.pkl')
NMF_MODEL = _load_pickle('model_rdmf.pkl')

In [43]:
# Sidebar: app title, logo, a spacer and the page selector
# (object notation instead of a `with st.sidebar:` block).
st.sidebar.title("Virus Detector!")
st.sidebar.image('virus.png')
st.sidebar.write("")  # blank line between the logo and the selector
page = st.sidebar.selectbox(
    "Detecting Models:",
    ["--------", "Visualization", "Machine Learning", "Deep Learning"],
)

# Render the page chosen in the sidebar select box (`page`).
if page == "--------":
    # landing page: slogan, spacer and wallpaper
    st.write("""
    *Deluxe virus hunting - attack with virus detection*
    """)
    st.write("")
    st.image('viruswallpaper.png')

##########################################################
# Visualization
##########################################################

elif page == "Visualization":
    st.title("Visualization")
    col1, col2, col3, col4 = st.columns([10, 2, 5, 5])
    with col1:
        # The default must lie inside [min_value, max_value]; the previous
        # default of 5 was outside 1..3 and made Streamlit raise an exception.
        n = st.slider(
            label="how many graphs?",
            min_value=1,
            max_value=3,
            value=3,
        )
    with col3:
        st.markdown("####")
        genre = st.checkbox("show raw data")
    with col4:
        st.markdown("###")
        show_button = st.button(label="show graph")

    # with the checkbox ticked, column 'p80' is shown next to 'p1'
    shown_columns = ['p1', 'p80'] if genre else ['p1']
    popular_movies = BEST_MOVIES[shown_columns]

    st.markdown("###")
    if show_button:
        st.write(
            HTML(popular_movies.head(n).to_html(escape=False))
        )

##########################################################
# Machine Learning
##########################################################

elif page == "Machine Learning":
    st.title("Machine Learning")
    col1, col2, col3 = st.columns([10, 1, 5])
    slots = range(1, 6)

    with col1:
        # five select boxes ("model 1" .. "model 5") separated by blank lines
        query_movies = []
        for slot in slots:
            if slot > 1:
                st.write("")
            query_movies.append(st.selectbox(f"model {slot}", TITLES))

    with col3:
        # The five sliders share label and range, so each one needs its own
        # `key`; without it Streamlit rejects them as duplicate widgets.
        query_ratings = [
            st.slider(
                label="activate",
                min_value=0,
                max_value=1,
                value=0,
                key=f"activate_{slot}",
            )
            for slot in slots
        ]

    # Selections that repeat the same entry collapse into a single key.
    user_query = dict(zip(query_movies, query_ratings))

    # persist the query so the "Deep Learning" page can read it back
    st.markdown("###")
    user_query_button = st.button(label="train machine learning model")
    if user_query_button:
        # context manager so the file is flushed and closed right away
        with open("user_query.json", 'w') as query_file:
            json.dump(user_query, query_file)
        st.write("")
        st.write("model prediction done")

##########################################################
# Deep Learning
##########################################################
else:
    st.title("Deep Learning")
    col1, col2, col3, col4, col5 = st.columns([1, 5, 1, 5, 1])
    with col2:
        recommender = st.radio(
            "recommender type",
            ["NMF Recommender", "Distance Recommender"]
        )
    with col4:
        st.write("###")
        recommend_button = st.button(label="recommed movies")

    # Load the query saved on the "Machine Learning" page. The file does not
    # exist until a query has been submitted, so a missing file is reported
    # instead of crashing the page.
    try:
        with open("user_query.json") as query_file:
            user_query = json.load(query_file)
    except FileNotFoundError:
        user_query = {}
        st.warning("no saved query found - submit one on the 'Machine Learning' page first")

    if recommend_button:
        # TODO: call recommend_nmf(user_query, NMF_MODEL, k=10) and
        # recommend_nn(user_query, DISTANCE_MODEL, Rt, k=10) here; until then
        # slices of BEST_MOVIES are shown as placeholders.
        # `AgGrid` was never imported in this file (NameError), so the built-in
        # st.dataframe is used to render the tables.
        if recommender == "NMF Recommender":
            st.dataframe(BEST_MOVIES.head(10))
        elif recommender == "Distance Recommender":
            st.dataframe(BEST_MOVIES.tail(5))
    



In [44]:
# Disabled scratch code kept inside a bare triple-quoted string: nothing in it
# is executed. It refers to `movies`, which is not defined anywhere in the
# visible file.
# NOTE(review): if this file is run as a Streamlit script, a bare string
# expression like this may be rendered on the page by Streamlit's "magic"
# output - confirm, and consider turning it into `#` comments or deleting it.
'''

new_user_query = {"Toy Story (1995)":5,
                 "Grumpier Old Men (1995)":2,
                 "Casino (1995)":3.5,
                 "Sabrina (1995)":4,
                 "GoldenEye (1995)":5}
print(new_user_query)


#AgGrid(BEST_MOVIES.head(20))
print("end")
#print([movies])
new_user_dataframe = pd.DataFrame(new_user_query, columns=movies['title'], index=[0])
new_user_dataframe_imputed =new_user_dataframe.fillna(0)
#type(new_user_dataframe)
#print(BEST_MOVIES)
#Ru = pd.DataFrame(data=new_user_dataframe_imputed, columns=movies['title'],index = UserId)
print(new_user_dataframe_imputed)
similarity_scores, neighbor_ids = DISTANCE_MODEL.kneighbors(
    new_user_dataframe_imputed,
    n_neighbors=15,
    return_distance=True
    )
'''

'\n\nnew_user_query = {"Toy Story (1995)":5,\n                 "Grumpier Old Men (1995)":2,\n                 "Casino (1995)":3.5,\n                 "Sabrina (1995)":4,\n                 "GoldenEye (1995)":5}\nprint(new_user_query)\n\n\n#AgGrid(BEST_MOVIES.head(20))\nprint("end")\n#print([movies])\nnew_user_dataframe = pd.DataFrame(new_user_query, columns=movies[\'title\'], index=[0])\nnew_user_dataframe_imputed =new_user_dataframe.fillna(0)\n#type(new_user_dataframe)\n#print(BEST_MOVIES)\n#Ru = pd.DataFrame(data=new_user_dataframe_imputed, columns=movies[\'title\'],index = UserId)\nprint(new_user_dataframe_imputed)\nsimilarity_scores, neighbor_ids = DISTANCE_MODEL.kneighbors(\n    new_user_dataframe_imputed,\n    n_neighbors=15,\n    return_distance=True\n    )\n'

In [45]:
#recommend_nn(new_user_dataframe_imputed, DISTANCE_MODEL,Rt, k=10)