<a href="https://colab.research.google.com/github/HeeSeung-Shin/Section3_pj/blob/master/flask%EC%9D%B4%EC%A0%84%EB%8B%A8%EA%B3%84.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import sys
import json
import requests
import pandas as pd
import sqlite3
from bs4 import BeautifulSoup
import random
import time
import urllib.request
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from tqdm import tqdm
import re
import collections
from wordcloud import STOPWORDS
from scipy.sparse import csr_matrix
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
from wordcloud import WordCloud
from konlpy.tag import Okt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pickler

In [None]:
def get_kobis_movie_data(key):
    """
    Download the KOBIS movie list page by page and return it as a DataFrame.

    Pages of 100 movies are requested from the ``searchMovieList`` endpoint
    until a page comes back empty or a request/parse error occurs (in which
    case the rows collected so far are still returned). Movies whose genre
    contains '에로' are excluded, as are rows where any field is empty.

    Parameters
    ----------
    key : str
        KOBIS open API key.

    Returns
    -------
    pandas.DataFrame
        Columns ``movieNm``, ``genreAlt``, ``director``, ``company``,
        ``prdtYear`` and ``nationAlt`` with a fresh 0-based index.
        ``director`` and ``company`` hold Python lists of names.
    """
    columns = ['movieNm', 'genreAlt', 'director', 'company', 'prdtYear', 'nationAlt']
    base_url = "http://www.kobis.or.kr/kobisopenapi/webservice/rest/movie/searchMovieList.json"

    # Rows are accumulated in a plain list and turned into a frame once;
    # growing a DataFrame cell by cell is quadratic.
    rows = []
    page = 0
    while True:
        page += 1
        url = f"{base_url}?key={key}&curPage={page}&itemPerPage=100"
        try:
            response = requests.get(url, timeout=10)
            payload = json.loads(response.content)
            movie_list = payload['movieListResult']['movieList']
        except (requests.RequestException, ValueError, KeyError, TypeError) as err:
            # Best effort, like the original: stop paging, keep what we have,
            # but say why instead of failing silently.
            print(f'KOBIS paging stopped at page {page}: {err!r}')
            break

        if not movie_list:
            # An empty page means we are past the last page.
            break

        for item in movie_list:
            rows.append({
                'movieNm': item.get('movieNm', ''),
                'genreAlt': item.get('genreAlt', ''),
                'director': [d.get('peopleNm', '') for d in item.get('directors') or []],
                'company': [c.get('companyNm', '') for c in item.get('companys') or []],
                'prdtYear': item.get('prdtYear', ''),
                'nationAlt': item.get('nationAlt', ''),
            })

    kobis_df = pd.DataFrame(rows, columns=columns)
    if kobis_df.empty:
        return kobis_df

    # Exclude erotic movies ('~' is the boolean negation; unary '-' is not
    # supported on boolean data).
    keep = ~kobis_df['genreAlt'].astype(str).str.contains('에로')

    # Drop rows that have an empty value ('' or []) in any column.
    for col in columns:
        keep &= kobis_df[col].astype(bool)

    return kobis_df[keep].reset_index(drop=True)

In [None]:
def _sqlite_ready(df):
    """Return a copy of *df* whose list/tuple cells are comma-joined strings."""
    ready = df.copy()
    for col in ready.columns:
        if ready[col].dtype == object:
            # sqlite3 cannot bind Python lists, so serialise them.
            ready[col] = ready[col].map(
                lambda v: ",".join(map(str, v)) if isinstance(v, (list, tuple)) else v
            )
    return ready


def put_df_to_database(database, table, df):
    """
    Store *df* in ``table`` of the SQLite file ``database``.

    If the table does not exist it is created from *df* (the DataFrame index
    is written to a column named ``index``). If it already exists the rows
    are appended and their ``index`` values continue after the largest
    ``index`` currently stored. List/tuple cells are written as
    comma-joined strings.

    Only the following database/table pairs are accepted; for anything else
    a usage message is printed and nothing is written.

    - ``movie.db``
        - ``kobis_movie``: movie information obtained from the KOBIS API
        - ``naver_movie``: movie information obtained from the Naver API
        - ``naver_review``: reviews of the movies found through Naver
        - ``modeling_review``: reviews used to train the sentiment model
    - ``web_service.db``
        - ``service_movie``: movies and keywords offered by the service
        - ``genre_keywords``: keywords per genre

    Parameters
    ----------
    database : str
        ``'movie.db'`` or ``'web_service.db'``.
    table : str
        One of the table names listed above for that database.
    df : pandas.DataFrame
        Data to store.
    """
    allowed_tables = {
        'movie.db': ('kobis_movie', 'naver_movie', 'naver_review', 'modeling_review'),
        'web_service.db': ('service_movie', 'genre_keywords'),
    }

    if table not in allowed_tables.get(database, ()):
        print('''
        1. movie:
        table 
        - kobis_movie: kobis_api를 통해 얻은 영화 정보 데이터를 넣어준다.
        - naver_movie: naver_api를 통해 얻은 영화 정보 데이터를 넣어준다.
        - naver_review: naver_api에 있는 영화의 리뷰데이터를 넣어준다.
        - modeling_review: 모델링에 사용할 리뷰데이터를 넣어준다.
        2. web_service
        table
        - service_movie: 서비스에 제공할 영화목록과 키워드가 담긴 데이터를 넣어준다.
        - genre_keywords :genre별 keyword가 들어간다.
        ''')
        return

    payload = _sqlite_ready(df)
    conn = sqlite3.connect(database)
    try:
        table_exists = conn.execute(
            "SELECT 1 FROM sqlite_master WHERE type = 'table' AND name = ?", (table,)
        ).fetchone() is not None

        if table_exists:
            # `table` comes from the whitelist above, so interpolating it is
            # safe; "index" must be quoted because INDEX is an SQL keyword.
            last = conn.execute(f'SELECT MAX("index") FROM "{table}"').fetchone()[0]
            start = int(last) + 1 if isinstance(last, (int, float)) else 0
            payload.index = range(start, start + len(payload))
            payload.to_sql(table, conn, if_exists='append', index_label='index')
        else:
            payload.to_sql(table, conn, index_label='index')
        conn.commit()
    finally:
        conn.close()


In [None]:
# Preprocessing helper
def change_form(l):
    """
    Strip list-literal residue from every string of *l*, in place.

    Each element is trimmed of surrounding whitespace, then of leading
    ``[`` and ``'`` characters, then of trailing ``]`` and ``'`` characters
    (in that order). The same list object is returned.
    """
    def _clean(raw):
        trimmed = raw.strip()
        without_prefix = trimmed.lstrip('[').lstrip("'")
        return without_prefix.rstrip(']').rstrip("'")

    l[:] = [_clean(item) for item in l]
    return l

In [None]:
def _strip_bold_tags(text):
    """Remove the ``<b>`` / ``</b>`` highlight tags from *text*."""
    return text.replace('<b>', '').replace('</b>', '')


def _same_year(left, right):
    """Return True when both values parse to the same whole number."""
    try:
        return int(float(left)) == int(float(right))
    except (TypeError, ValueError):
        return False


def _pick_naver_item(movie_json, title, year):
    """Pick the search hit for *title*/*year* from one response, or None."""
    items = movie_json.get('items') or []
    if not items:
        return None
    if movie_json.get('display') == 1:
        # A single hit is accepted as-is.
        return items[0]
    for item in items:
        if (_strip_bold_tags(item.get('title', '')) == title
                and _same_year(item.get('pubDate'), year)):
            return item
    return None


def get_Naver_movie_info(client_id,client_secret_id,kobis_df):
    '''
    Look up every movie of *kobis_df* in the Naver movie search API and
    return link, image, publication year, director, actor and user rating.

    When a query returns exactly one result it is used directly; when it
    returns several, the first one whose title (without ``<b>`` tags) equals
    the KOBIS title and whose ``pubDate`` equals the KOBIS ``prdtYear`` is
    used. Movies without a usable match, or with any empty field, are
    dropped. HTTP errors raised by ``urllib`` are not caught.

    parameters
    client_id: client id for the Naver movie API
    client_secret_id: client secret for the Naver movie API
    kobis_df: DataFrame with the columns ``movieNm``, ``prdtYear``,
        ``genreAlt`` and ``nationAlt``

    returns
    DataFrame with the columns ``title``, ``link``, ``image``, ``pubDate``,
    ``director``, ``actor``, ``userRating``, ``genre`` and ``nation`` and a
    fresh 0-based index.
    '''
    import urllib.parse

    naver_fields = ['link', 'image', 'pubDate', 'director', 'actor', 'userRating']

    # Work on a 0..n-1 index so request order and row labels line up.
    source = kobis_df.reset_index(drop=True)

    records = []
    pairs = zip(source['movieNm'], source['prdtYear'])
    for num, (movie_name, year) in enumerate(pairs, start=1):
        if num % 10 == 0:
            # Pause every 10 requests and report progress.
            time.sleep(2)
            print(num)

        url = ("https://openapi.naver.com/v1/search/movie.json?query="
               + urllib.parse.quote(movie_name))
        request = urllib.request.Request(url)
        request.add_header("X-Naver-Client-Id", client_id)
        request.add_header("X-Naver-Client-Secret", client_secret_id)
        with urllib.request.urlopen(request) as response:
            movie_json = json.loads(response.read().decode('utf-8'))

        item = _pick_naver_item(movie_json, movie_name, year)
        record = {'title': movie_name}
        for field in naver_fields:
            record[field] = item.get(field) if item is not None else None
        records.append(record)

    movie_info = pd.DataFrame(records, columns=['title'] + naver_fields)

    # Drop rows with a missing (None/NaN) or empty value in any column.
    # NaN is truthy, so it has to be replaced before the truthiness test.
    complete = movie_info.fillna('').astype(bool).all(axis=1)
    movie_info = movie_info[complete].copy()

    def _normalise(value):
        return ",".join(change_form(value.split(',')))

    # Index labels still match `source`, so these assignments align by row.
    movie_info['genre'] = source['genreAlt']
    movie_info['nation'] = source['nationAlt']
    movie_info['genre'] = movie_info['genre'].apply(_normalise)
    movie_info['nation'] = movie_info['nation'].apply(_normalise)

    movie_info.reset_index(drop=True, inplace=True)
    return movie_info

In [1]:
def _parse_review_rows(soup, movie_title):
    """Extract review text and star rating from one parsed review-list page."""
    table = soup.find('table', class_='list_netizen')
    body = table.find('tbody') if table is not None else None
    if body is None:
        return []

    parsed = []
    for row in body.find_all('tr'):
        cells = row.find_all('td')
        score_box = row.find('div', class_='list_netizen_score')
        score_tag = score_box.find('em') if score_box is not None else None
        if len(cells) < 2 or score_tag is None:
            continue
        try:
            star = int(score_tag.text)
        except ValueError:
            continue
        # The review text is the node that follows the <br> in the 2nd cell.
        line_break = cells[1].select_one('br')
        sibling = line_break.next_sibling if line_break is not None else None
        text = sibling.strip() if isinstance(sibling, str) else ''
        parsed.append({'review_text': text, 'review_star': star, 'movie_title': movie_title})
    return parsed


def get_service_review(naver_movie_title,naver_movie_link):
    '''
    Scrape Naver reviews and star ratings for the given movies.

    For each movie the first review page is fetched; only movies whose
    pager shows more than 10 page links are scraped, and for those the
    review pages 1 to 9 are collected.

    parameters
        naver_movie_title: iterable of Naver movie titles
        naver_movie_link : iterable of Naver movie urls (each containing
                           ``code=<movie code>``), parallel to the titles

    returns
        DataFrame with the columns ``review_text``, ``review_star`` (int)
        and ``movie_title``; the index starts at 1.
    '''
    url_template = ("https://movie.naver.com/movie/point/af/list.naver"
                    "?st=mcode&sword={code}&target=after&page={page}")

    def fetch(code, page_num):
        response = requests.get(url_template.format(code=code, page=page_num), timeout=10)
        return BeautifulSoup(response.content, 'html.parser')

    records = []
    for movie_title, movie_link in zip(naver_movie_title, naver_movie_link):
        if 'code=' not in movie_link:
            continue
        movie_code = movie_link.split('code=')[1]

        first_page = fetch(movie_code, 1)
        paging = first_page.find('div', class_='paging')
        pager = paging.find('div') if paging is not None else None
        page_links = pager.find_all('a') if pager is not None else []
        if len(page_links) <= 10:
            continue

        # Page 1 is already downloaded; fetch pages 2..9 on top of it.
        records.extend(_parse_review_rows(first_page, movie_title))
        for page_num in range(2, 10):
            records.extend(_parse_review_rows(fetch(movie_code, page_num), movie_title))

    return pd.DataFrame(
        records,
        columns=['review_text', 'review_star', 'movie_title'],
        index=pd.RangeIndex(1, len(records) + 1),
    )

In [None]:
def get_modeling_review():
    '''
    Scrape the most recent Naver movie reviews for sentiment-model training.

    Review-list pages 1 to 999 are downloaded with a random pause of
    0.2-1.2 seconds between requests (the original note says this yields
    about 10,000 reviews, i.e. it presumably assumes 10 reviews per page).

    returns
        DataFrame with the columns ``review_text`` (str) and
        ``review_star`` (int), the column name that ``modeling`` reads.
    '''

    url = "https://movie.naver.com/movie/point/af/list.nhn?&page="
    headers = {'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36'}

    records = []

    for page in range(1, 1000):
        web = requests.get(url + str(page), headers=headers, timeout=10).content
        soup = BeautifulSoup(web, 'html.parser')

        score_boxes = soup.find_all('div', {'class': "list_netizen_score"})
        comment_cells = soup.find_all('td', {'class': "title"})

        # Scores and comments are paired by position, so a page where the
        # two counts differ cannot be paired reliably and is skipped.
        if len(score_boxes) == len(comment_cells):
            for score_box, comment in zip(score_boxes, comment_cells):
                score_tag = score_box.find('em')
                if score_tag is None:
                    continue
                try:
                    star = int(score_tag.text)
                except ValueError:
                    continue
                # The review text is the node right after the <br> tag.
                line_break = comment.select_one('br')
                sibling = line_break.next_sibling if line_break is not None else None
                text = sibling.strip() if isinstance(sibling, str) else ''
                records.append({'review_text': text, 'review_star': star})

        interval = round(random.uniform(0.2, 1.2), 2)
        time.sleep(interval)

    modeling_review = pd.DataFrame(records, columns=['review_text', 'review_star'])
    return modeling_review

In [None]:
# Preprocessing helpers
def apply_regular_expression(text):
    """Return *text* with everything except spaces and Hangul removed."""
    return re.sub('[^ ㄱ-ㅣ 가-힣]', '', text)
# Compiled once: keeps spaces, ASCII letters and Hangul, drops the rest.
_TEXT_CLEANING_PATTERN = re.compile('[^ a-z A-Z ㄱ-ㅣ 가-힣]')
# Okt instance shared by all calls; created on first use.
_TEXT_CLEANING_OKT = None


def text_cleaning(text, stop_words=None):
    """
    Tokenise *text* into morphemes and drop noise tokens.

    Characters other than spaces, ASCII letters and Hangul are removed, the
    rest is split with ``Okt.morphs(norm=True)``, and tokens that are one
    character long or listed as stop words are discarded.

    Parameters
    ----------
    text : str
        Raw review text.
    stop_words : collection of str, optional
        Stop words to remove. When omitted, the module-level name
        ``stop_words_lst`` is used, as before.

    Returns
    -------
    list of str

    Raises
    ------
    NameError
        If *stop_words* is omitted and ``stop_words_lst`` is not defined.
    """
    global _TEXT_CLEANING_OKT

    if stop_words is None:
        try:
            stop_words = stop_words_lst
        except NameError:
            raise NameError(
                "stop_words_lst is not defined; pass stop_words explicitly "
                "or define a module-level stop_words_lst"
            ) from None

    # Set membership is O(1); a plain list would be scanned per token.
    if isinstance(stop_words, (set, frozenset)):
        blocked = stop_words
    else:
        blocked = set(stop_words)

    if _TEXT_CLEANING_OKT is None:
        _TEXT_CLEANING_OKT = Okt()

    result = _TEXT_CLEANING_PATTERN.sub('', text)
    words = _TEXT_CLEANING_OKT.morphs(result, norm=True)
    return [x for x in words if len(x) > 1 and x not in blocked]
def neg_pos(x):
    """Map a star rating to a label: 1 when it is 7 or higher, otherwise 0."""
    return 1 if x >= 7 else 0

In [None]:
def modeling(modeling_review_df):
    '''
    Train the sentiment model on the collected reviews and save the
    artifacts used for later prediction in the current directory.

    Files written:
    - ``stop_words_lst.pkl``: list of stop words
    - ``model.pkl``: logistic regression fitted on all labelled reviews
    - ``df_final.pkl``: DataFrame with ``word``, ``word_num`` and ``score``
      (the model coefficient of each vocabulary word)

    Reviews rated 4, 5 or 6 are excluded; ratings of 7 and above are
    labelled positive (1), the rest negative (0). The validation accuracy
    on a 30% hold-out split is printed.

    parameter
    modeling_review_df: review data with a ``review_text`` column and a
        rating column named ``review_star`` (``review_score`` is accepted
        as a fallback)

    raises
    ValueError: if the labelled reviews do not contain both classes
    '''
    # text_cleaning() reads the module-level ``stop_words_lst`` when it is
    # used as the vectorizer's tokenizer, so the list is published globally.
    global stop_words_lst

    import pickle
    from sklearn.model_selection import train_test_split

    if 'review_star' in modeling_review_df.columns:
        star_col = 'review_star'
    else:
        star_col = 'review_score'

    reviews = modeling_review_df.dropna(subset=['review_text']).copy()
    reviews['review_text'] = reviews['review_text'].astype(str)

    # Corpus of all reviews, used below to extend the stop words.
    corpus = " ".join(reviews['review_text'].tolist())

    # Ratings may arrive as strings; coerce before filtering and labelling.
    stars = pd.to_numeric(reviews[star_col], errors='coerce')
    usable = stars.notna() & ~stars.isin([4, 5, 6])
    reviews = reviews[usable].copy()
    reviews['positiveness'] = (stars[usable] >= 7).astype(int)
    if reviews['positiveness'].nunique() < 2:
        raise ValueError('need both positive and negative reviews to train the model')

    # Build the stop word set. header=None keeps the first line as a word
    # instead of consuming it as a column header (assumes the file is a
    # bare word list - TODO confirm).
    stopword_table = pd.read_csv(
        "https://raw.githubusercontent.com/yoonkt200/FastCampusDataset/master/korean_stopwords.txt",
        header=None,
    )
    stop_words = {str(row[0]) for row in stopword_table.values.tolist()}
    movie_words = {'하다', '보다','있다','없다','너무','이다','영화',
            '되다','않다','같다','만들다','그냥','보고','정말',
            '가다','들다','진짜','싶다','정도','오다','많다',
            '연기','배우','그리고','부분','나다','편이','분들',
                '작품','아니다','되는','겁니다','감독','합니다','싶을','같네'}
    stop_words = stop_words.union(movie_words)

    # Single-character tokens (outside the ignored tags) become stop words.
    ignored_tags = {"Josa", "Eomi", "Punctuation", "Foreign", "Number",
                    "Hashtag", "URL", "PreEomi"}
    for token, tag in Okt().pos(corpus, norm=True, stem=True):
        if tag not in ignored_tags and len(token) == 1:
            stop_words.add(token)

    stop_words_lst = list(stop_words)

    # Save the stop words for later prediction (after the list is complete).
    with open('stop_words_lst.pkl','wb') as pickle_file:
        pickle.dump(stop_words_lst, pickle_file)

    # TF-IDF features; equivalent to counting followed by TF-IDF weighting.
    vect = TfidfVectorizer(tokenizer=text_cleaning)
    x_data = vect.fit_transform(reviews['review_text'].tolist())
    y_data = reviews['positiveness']

    x_train, x_test, y_train, y_test = train_test_split(
        x_data, y_data, test_size=0.3, random_state=0
    )

    lr = LogisticRegression(random_state=0)
    lr.fit(x_train, y_train)
    lr_pred = lr.predict(x_test)
    print('val accuracy: %.4f' % accuracy_score(y_test, lr_pred))

    # Refit on all labelled data for the model that is actually saved.
    lr = LogisticRegression(random_state=0)
    lr.fit(x_data, y_data)

    with open('model.pkl','wb') as pickle_file:
        pickle.dump(lr, pickle_file)

    # One row per vocabulary word with its coefficient.
    vocabulary = vect.vocabulary_
    df_final = pd.DataFrame({
        'word': list(vocabulary.keys()),
        'word_num': [int(num) for num in vocabulary.values()],
    })
    df_final['score'] = lr.coef_[0][df_final['word_num'].to_numpy()]

    with open('df_final.pkl','wb') as pickle_file:
        pickle.dump(df_final, pickle_file)

In [None]:
# Sentiment-analysis helpers
# Cache of the unpickled artifacts, refreshed when either file changes.
_SENTIMENT_CACHE = {}


def _load_sentiment_artifacts():
    """Return ``(stop_words, word -> score dict)`` from the pickle files."""
    import pickle

    paths = ('stop_words_lst.pkl', 'df_final.pkl')
    stamp = tuple(os.path.getmtime(path) for path in paths)
    if _SENTIMENT_CACHE.get('stamp') != stamp:
        # NOTE: pickle must only be used on trusted, locally produced files.
        with open(paths[0], 'rb') as pickle_file:
            stop_words = pickle.load(pickle_file)
        with open(paths[1], 'rb') as pickle_file:
            df_final = pickle.load(pickle_file)
        _SENTIMENT_CACHE['stop_words'] = stop_words
        _SENTIMENT_CACHE['scores'] = dict(zip(df_final['word'], df_final['score']))
        _SENTIMENT_CACHE['stamp'] = stamp
    return _SENTIMENT_CACHE['stop_words'], _SENTIMENT_CACHE['scores']


def pos_cal(text):
    '''
    Classify *text* as positive (1) or not positive (0).

    The text is tokenised with ``text_cleaning``; the scores stored in
    ``df_final.pkl`` are averaged over the tokens found there. The result
    is 1 when that average is above zero and 0 otherwise, including when
    no token is known.

    Reads ``stop_words_lst.pkl`` and ``df_final.pkl`` from the current
    directory.
    '''
    # text_cleaning() looks up the module-level ``stop_words_lst``, so the
    # loaded list has to be published before it is called.
    global stop_words_lst
    stop_words_lst, word_scores = _load_sentiment_artifacts()

    clnd_txt = text_cleaning(text)

    known_scores = [float(word_scores[token]) for token in clnd_txt if token in word_scores]
    if not known_scores:
        return 0

    final_score = sum(known_scores) / len(known_scores)
    # An average of exactly zero counts as not positive.
    return 1 if final_score > 0 else 0

In [None]:
def get_service_movie_info(movie_info_df, movie_review_df):
    '''
    Build the service movie list and the per-genre keyword table.

    Up to 600 movies are analysed (the 300 highest rated and the 300 lowest
    rated with a non-zero rating), because keyword extraction over every
    movie takes very long. For each movie the reviews are classified with
    ``pos_cal`` and the five most frequent nouns of the positive and of the
    negative reviews are stored.

    Reads ``stop_words_lst.pkl`` from the current directory.

    parameters
    movie_info_df: Naver movie information with at least the columns
        ``title``, ``userRating``, ``genre`` and ``actor``
    movie_review_df : Naver reviews with the columns ``movie_title`` and
        ``review_text``

    returns
    (anal_df, genre_keywords)
    anal_df: the selected movies plus ``pos_per`` (share of positive
        reviews), ``pos_key`` and ``neg_key`` (comma-joined keywords)
    genre_keywords: DataFrame with the columns ``genre`` and ``keywords``;
        the keywords come from the ``pos_key`` of the 20 movies of that
        genre with the highest ``pos_per``
    '''
    import pickle
    from collections import Counter

    # ---- select the movies to analyse ----------------------------------
    info = movie_info_df.drop(columns='index', errors='ignore').reset_index(drop=True)
    # Ratings may be strings; rank on a numeric view without changing them.
    rating = pd.to_numeric(info['userRating'], errors='coerce')
    top_rows = list(rating.sort_values(ascending=False).index[:300])
    rated = rating[rating.notna() & (rating != 0)]
    already_picked = set(top_rows)
    # With fewer than 600 movies both selections overlap; keep each once.
    bottom_rows = [row for row in rated.sort_values().index[:300]
                   if row not in already_picked]
    anal_df = info.loc[top_rows + bottom_rows].reset_index(drop=True)

    # ---- classify the reviews of those movies --------------------------
    reviews = movie_review_df.drop(columns='index', errors='ignore')
    wanted = (reviews['movie_title'].isin(set(anal_df['title']))
              & reviews['review_text'].notna()
              & (reviews['review_text'] != ''))
    anal_review = reviews[wanted].copy()
    anal_review['positiveness'] = [pos_cal(text) for text in anal_review['review_text']]

    with open('stop_words_lst.pkl', 'rb') as pickle_file:
        stop_words = set(pickle.load(pickle_file))

    okt = Okt()

    def top_keywords(texts):
        texts = list(texts)
        if not texts:
            return ''
        nouns = okt.nouns(apply_regular_expression(" ".join(texts)))
        counts = Counter(n for n in nouns if len(n) >= 2 and n not in stop_words)
        return ",".join(word for word, _ in counts.most_common(5))

    # ---- per-movie share of positive reviews and keywords --------------
    reviews_by_title = dict(tuple(anal_review.groupby('movie_title')))
    pos_per, pos_key, neg_key = [], [], []
    for movie_name in anal_df['title']:
        group = reviews_by_title.get(movie_name)
        if group is None:
            pos_per.append(float('nan'))
            pos_key.append('')
            neg_key.append('')
            continue
        pos_per.append(group['positiveness'].mean())
        pos_key.append(top_keywords(group.loc[group['positiveness'] == 1, 'review_text']))
        neg_key.append(top_keywords(group.loc[group['positiveness'] == 0, 'review_text']))
    anal_df['pos_per'] = pos_per
    anal_df['pos_key'] = pos_key
    anal_df['neg_key'] = neg_key

    # ---- per-genre keywords --------------------------------------------
    genres_per_movie = anal_df['genre'].fillna('').apply(
        lambda value: [g for g in str(value).split(',') if g]
    )
    genre_list = sorted({g for genres in genres_per_movie for g in genres})

    ranked = anal_df.sort_values('pos_per', ascending=False)
    ranked_genres = genres_per_movie.loc[ranked.index]
    keyword_rows = []
    for genre in genre_list:
        # Exact genre membership; genre names such as '공포(호러)' are not
        # valid as regex patterns and substrings could match other genres.
        in_genre = [genre in genres for genres in ranked_genres]
        best_keys = ranked.loc[in_genre, 'pos_key'].iloc[:20]
        unique_keys = dict.fromkeys(
            key for keys in best_keys for key in keys.split(',') if key
        )
        keyword_rows.append(",".join(unique_keys))
    genre_keywords = pd.DataFrame(
        {'genre': genre_list, 'keywords': keyword_rows},
        columns=['genre', 'keywords'],
    )

    # Remove the search-highlight tags from the actor names.
    anal_df['actor'] = (anal_df['actor']
                        .str.replace('<b>', '', regex=False)
                        .str.replace('</b>', '', regex=False))

    return anal_df,genre_keywords

In [None]:
# End-to-end run: collect the data, store it, train the sentiment model and
# build the tables used by the web service.
# NOTE: `key`, `client_id` and `client_secret_id` are not defined in the
# code shown here; they must be set (KOBIS API key, Naver API credentials)
# before this cell is executed.
kobis_df = get_kobis_movie_data(key)
movie_info = get_Naver_movie_info(client_id,client_secret_id,kobis_df)
naver_movie_title = movie_info['title']
naver_movie_link = movie_info['link']


movie_df = get_service_review(naver_movie_title,naver_movie_link)
modeling_review_df = get_modeling_review()
put_df_to_database('movie.db', 'kobis_movie', kobis_df)
put_df_to_database('movie.db', 'naver_movie', movie_info)
put_df_to_database('movie.db', 'naver_review', movie_df)
put_df_to_database('movie.db', 'modeling_review', modeling_review_df)

# Training writes the pickle files that the keyword step below reads.
modeling(modeling_review_df)

service_movie,genre_keywords = get_service_movie_info(movie_info, movie_df)
put_df_to_database('web_service.db', 'service_movie', service_movie)
put_df_to_database('web_service.db', 'genre_keywords', genre_keywords)