베이스라인

In [None]:
import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

# Load the view log, the article metadata and the submission template.
view_log_train = pd.read_csv('view_log.csv')
article_info = pd.read_csv('article_info.csv')
submission = pd.read_csv('sample_submission.csv')

# User x article matrix of view counts (0 where a user has no views of an article).
view_counts = view_log_train.groupby(['userID', 'articleID']).size()
user_article_matrix = view_counts.unstack(fill_value=0)

# Cosine similarity between every pair of user rows.
user_similarity = cosine_similarity(user_article_matrix)

# Predicted score: similarity-weighted view counts divided by each user's
# total absolute similarity.
similarity_mass = np.abs(user_similarity).sum(axis=1)
user_predicted_scores = user_similarity.dot(user_article_matrix) / np.array([similarity_mass]).T

# Collect five articles per user; already-viewed articles are not filtered out.
recommendations = []
for idx, user in enumerate(user_article_matrix.index):
    # Ascending argsort, reversed: column positions from highest to lowest score.
    sorted_indices = user_predicted_scores[idx].argsort()[::-1]
    top5recommend = list(user_article_matrix.columns[sorted_indices][:5])

    for article in top5recommend:
        recommendations.append([user, article])

# Same layout as sample_submission.csv: one (userID, articleID) pair per row.
top_recommendations = pd.DataFrame(recommendations, columns=['userID', 'articleID'])

# The column is aligned on the row index.
# NOTE(review): assumes the template lists users in the same order as
# `recommendations` - confirm.
submission = submission.assign(articleID=top_recommendations['articleID'])
submission.to_csv('baseline_submission.csv', index=False)


In [None]:
import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

# Reload the view log and the article metadata from the working directory.
view_log_train = pd.read_csv('view_log.csv')
article_info = pd.read_csv('article_info.csv')

In [None]:
# List the distinct values of the Language column.
article_info['Language'].unique()

array(['en', 'pt', 'es', 'la', 'ja'], dtype=object)

In [None]:
!pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━[0m [32m839.7/981.5 kB[0m [31m27.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993227 sha256=c71311d39f9e8f2d3e2cd6e67633ca86f4edbedaf0d0157b62af4bb88e57a377
  Stored in directory: /root/.cache/pip/wheels/95/03/7d/59ea870c70ce4e5a370638b5462a7711ab78fba2f655d05106
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [None]:
import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from langdetect import detect, DetectorFactory

# Download the NLTK resources used below (only needed on the first run).
for resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(resource)

# Fix the langdetect seed to avoid non-deterministic detection results.
DetectorFactory.seed = 0

def clean_text(text, use_stemming=False, use_lemmatization=False):
    """Clean a text string and return it as space-joined tokens.

    Steps: replace HTML tags with a space, remove punctuation and digits,
    collapse whitespace, detect the language, tokenize, drop stop words and
    optionally stem or lemmatize. Lowercasing is not applied, so surviving
    tokens keep their original case.

    Args:
        text: Raw text to clean.
        use_stemming: If true, apply ``PorterStemmer`` to every token.
        use_lemmatization: If true (and ``use_stemming`` is false), apply
            ``WordNetLemmatizer`` to every token.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    # 1. Replace HTML tags with a space.
    text = re.sub(r'<.*?>', ' ', text)

    # 2. Remove punctuation.
    text = text.translate(str.maketrans('', '', string.punctuation))

    # 3. Remove digits.
    text = re.sub(r'\d+', '', text)

    # 4. Collapse runs of whitespace.
    text = re.sub(r'\s+', ' ', text).strip()

    # 5. Detect the language; fall back to English when detection fails
    #    (for example on empty text). `except Exception` is used instead of a
    #    bare `except` so KeyboardInterrupt / SystemExit still propagate.
    try:
        lang = detect(text)
    except Exception:
        lang = 'en'

    # 6. Tokenize.
    tokens = nltk.word_tokenize(text)

    # 7. Remove stop words for the detected language; anything without a
    #    dedicated list here is treated as English.
    stopword_language = {'pt': 'portuguese', 'es': 'spanish'}.get(lang, 'english')
    stop_words = set(stopwords.words(stopword_language))

    # The stop-word lists are lowercase while the text keeps its case, so
    # compare on the lowercased token; otherwise "The" or "Para" would survive.
    tokens = [word for word in tokens if word.lower() not in stop_words]

    # 8. Optional stemming or lemmatization (stemming wins if both are set).
    if use_stemming:
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(word) for word in tokens]
    elif use_lemmatization:
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return ' '.join(tokens)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Summarise the view log: columns, non-null counts and dtypes.
view_log_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42717 entries, 0 to 42716
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   userID       42717 non-null  object
 1   articleID    42717 non-null  object
 2   userRegion   42717 non-null  object
 3   userCountry  42717 non-null  object
dtypes: object(4)
memory usage: 1.3+ MB


In [None]:
# Summarise the article metadata: columns, non-null counts and dtypes.
article_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3008 entries, 0 to 3007
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   articleID    3008 non-null   object
 1   Title        3008 non-null   object
 2   Content      3008 non-null   object
 3   Format       3008 non-null   object
 4   Language     3008 non-null   object
 5   userID       3008 non-null   object
 6   userCountry  659 non-null    object
 7   userRegion   657 non-null    object
dtypes: object(8)
memory usage: 188.1+ KB


userCountry, userRegion 컬럼은 결측값이 많음 (3008개 중 각각 659개, 657개만 값이 있음). 결측값을 모두 NaN으로 넣을지 검토 필요.

In [None]:
# List the distinct values of userCountry in the article metadata.
article_info['userCountry'].unique()

array([nan, 'BR', 'AU', 'US', 'CA', 'PT'], dtype=object)

In [None]:
# Build the user x article matrix of view counts (0 where a user has no views).
view_counts = view_log_train.groupby(['userID', 'articleID']).size()
user_article_matrix = view_counts.unstack(fill_value=0)

# (users, articles); the recorded run shows 1415 users and 2879 distinct articles.
user_article_matrix.shape

(1415, 2879)

In [None]:
# Preview the Title column of the article metadata.
article_info['Title']

0                            19 Tips For Everyday Git Use
1       Intel buys computer vision startup Itseez to i...
2            Practical End-to-End Testing with Protractor
3       Corporate venture growth in Brazil is another ...
4       Cross-channel user experiences with Drupal (aw...
                              ...                        
3003    Como consumir conteúdo de qualidade em iOS - C...
3004                               Aurelia 1.0 is Here!!!
3005    Lessons from converting an app to 100% Kotlin ...
3006    ITA está oferecendo 10 cursos gratuitos a dist...
3007              Analytics startup Amplitude raises $15M
Name: Title, Length: 3008, dtype: object

In [None]:
# Compute the pairwise cosine similarity between the rows (users) of the
# user-article matrix.
user_similarity = cosine_similarity(user_article_matrix)

user_similarity

array([[1.        , 0.        , 0.        , ..., 0.02571722, 0.        ,
        0.01028689],
       [0.        , 1.        , 0.        , ..., 0.        , 0.00847884,
        0.02581989],
       [0.        , 0.        , 1.        , ..., 0.        , 0.06495046,
        0.        ],
       ...,
       [0.02571722, 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.00847884, 0.06495046, ..., 0.        , 1.        ,
        0.        ],
       [0.01028689, 0.02581989, 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [None]:
# Predicted score: similarity-weighted view counts divided by each user's
# total absolute similarity.
similarity_mass = np.abs(user_similarity).sum(axis=1)
user_predicted_scores = user_similarity.dot(user_article_matrix) / np.array([similarity_mass]).T

# Collect five articles per user; already-viewed articles are not filtered out.
recommendations = []
for idx, user in enumerate(user_article_matrix.index):
    # Ascending argsort, reversed: column positions from highest to lowest score.
    sorted_indices = user_predicted_scores[idx].argsort()[::-1]
    top5recommend = list(user_article_matrix.columns[sorted_indices][:5])

    for article in top5recommend:
        recommendations.append([user, article])

In [None]:
# Load the submission template.
submission = pd.read_csv('./sample_submission.csv')

# Same layout as sample_submission.csv: one (userID, articleID) pair per row.
top_recommendations = pd.DataFrame(recommendations, columns=['userID', 'articleID'])

# The column is aligned on the row index.
# NOTE(review): assumes the template lists users in the same order as
# `recommendations` - confirm.
submission = submission.assign(articleID=top_recommendations['articleID'])
submission.to_csv('baseline_submission.csv', index=False)

In [None]:
# Pearson correlation between the users' view-count rows, labelled by userID
# on both axes.
user_ids = user_article_matrix.index
pearson = pd.DataFrame(np.corrcoef(user_article_matrix), index=user_ids, columns=user_ids)

In [None]:
# Score articles with Pearson correlation as the neighbour weight.
# The weighted sum and the normaliser must use the same weights. Previously the
# sum used the cosine similarities and only the normaliser used Pearson; a
# positive per-user divisor cannot change that user's ranking, so the result
# was identical to the cosine baseline.
pearson_weights = np.asarray(pearson)
user_predicted_scores = (
    pearson_weights.dot(np.asarray(user_article_matrix))
    / np.abs(pearson_weights).sum(axis=1, keepdims=True)
)

# Collect five articles per user; already-viewed articles are not filtered out.
recommendations = []
for idx, user in enumerate(user_article_matrix.index):
    # Ascending argsort, reversed: column positions from highest to lowest score.
    sorted_indices = user_predicted_scores[idx].argsort()[::-1]
    top5recommend_pearson = list(user_article_matrix.columns[sorted_indices[:5]])

    for article in top5recommend_pearson:
        recommendations.append([user, article])

In [None]:
# Same layout as sample_submission.csv: one (userID, articleID) pair per row.
top_recommendations = pd.DataFrame(recommendations, columns=['userID', 'articleID'])

# The column is aligned on the row index.
# NOTE(review): assumes the template lists users in the same order as
# `recommendations` - confirm.
submission = submission.assign(articleID=top_recommendations['articleID'])
submission.to_csv('baseline_pearson_similarity_submission.csv', index=False)

사용자-아이템 기반 추천 시스템 활용

- 9.6 아이템 기반 최근접 이웃 협업 필터링 실습 응용 ( 사용자 아이디만 사용하고 나머지 정보 생략)

## 콘텐츠 기반 필터링 실습 9.5 오 이거 하면 되겠다

진행 방법: 기사 제목과 내용을 합쳐서 텍스트를 가공하고, movies_df에서 했던 것처럼 딕셔너리 형태로 행렬을 만든다. → CountVectorizer로 피처 벡터화한 뒤 코사인 유사도를 비교한다. → 기사별로 유사도가 높은 레코드를 순서대로 뽑아서 차등 점수를 준다. → 개인 정보 표에서 점수를 합산해서 추천하는 등의 방법으로 진행한다.


In [None]:
import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

from nltk.tokenize import word_tokenize
# Download the NLTK resources used below (only needed on the first run).
for resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(resource)

def clean_text(txt):
    """Clean a text string for bag-of-words vectorisation.

    Steps, in order:
    * lowercase
    * expand the contractions listed in ``contraction_dict``
    * remove punctuation and digits
    * split into words
    * remove English stop words
    * drop tokens that are not purely alphabetic

    Args:
        txt: Raw text to clean.

    Returns:
        The surviving tokens joined by single spaces.
    """
    contraction_dict = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because",
                        "could've": "could have", "couldn't": "could not", "didn't": "did not",
                        "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not",
                        "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is",
                        "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have"}

    # Longest keys first, so "you'll've" is tried before its prefix "you'll";
    # keys are escaped so they are always matched literally.
    contraction_re = re.compile(
        '|'.join(re.escape(key) for key in sorted(contraction_dict, key=len, reverse=True))
    )

    # Lowercase before expanding: the dictionary keys are lowercase, so a
    # capitalised "Don't" would otherwise be missed.
    txt = txt.lower()
    txt = contraction_re.sub(lambda match: contraction_dict[match.group(0)], txt)

    # Remove punctuation, then digits.
    txt = "".join(char for char in txt if char not in string.punctuation)
    txt = re.sub('[0-9]+', '', txt)

    # Split into words.
    words = word_tokenize(txt)

    # Drop English stop words and any token that is not purely alphabetic
    # (leftover punctuation or symbols).
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if w not in stop_words and w.isalpha()]

    return ' '.join(words)

# Clean the article titles and bodies, overwriting the original columns.
for text_column in ('Title', 'Content'):
    article_info[text_column] = article_info[text_column].apply(clean_text)

article_info.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Unnamed: 0,articleID,Title,Content,Format,Language,userID,userCountry,userRegion
0,ARTICLE_0000,tips everyday git use,ive using git full time past years wanted shar...,HTML,en,USER_0683,,
1,ARTICLE_0001,intel buys computer vision startup itseez impr...,intel acquired computer vision machine learnin...,HTML,en,USER_1129,,
2,ARTICLE_0002,practical endtoend testing protractor,one reasons angularjs great work developed aro...,HTML,en,USER_0256,,
3,ARTICLE_0003,corporate venture growth brazil another sign m...,despite recent positive news renewed interest ...,HTML,en,USER_1304,,
4,ARTICLE_0004,crosschannel user experiences drupal awesome v...,last year around time wrote big reverse web wo...,HTML,en,USER_0336,,


In [None]:
# Split each title into whitespace-separated tokens. Values that are not strings
# (e.g. already-split lists) are left unchanged, so re-running this cell does
# not raise AttributeError.
article_info['Title'] = article_info['Title'].apply(lambda x: x.split() if isinstance(x, str) else x)

In [None]:
# Summarise the article metadata again: columns, non-null counts and dtypes.
article_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3008 entries, 0 to 3007
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   articleID    3008 non-null   object
 1   Title        3008 non-null   object
 2   Content      3008 non-null   object
 3   Format       3008 non-null   object
 4   Language     3008 non-null   object
 5   userID       3008 non-null   object
 6   userCountry  659 non-null    object
 7   userRegion   657 non-null    object
dtypes: object(8)
memory usage: 188.1+ KB


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Titles may be token lists (if the split cell ran) or plain strings. Only join
# lists: ' '.join on a string would insert a space between every character.
article_info['Title'] = article_info['Title'].apply(
    lambda x: ' '.join(x) if isinstance(x, (list, tuple)) else x
)

# Unigram + bigram counts per title. min_df=1 keeps every term; an integer
# min_df of 0 is rejected by scikit-learn's parameter validation.
count_vect = CountVectorizer(min_df=1, ngram_range=(1,2))
title_mat = count_vect.fit_transform(article_info['Title'])
print(title_mat.shape)

(3008, 22334)


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the title count vectors.
title_sim = cosine_similarity(title_mat, title_mat)
# Holds the similarity values; each row is one article's title similarity to every
# article (the original note said "genre", but the input here is the title matrix).
print(title_sim.shape)
print(title_sim[:1])

(3008, 3008)
[[1. 0. 0. ... 0. 0. 0.]]


In [None]:
# For each row, column positions ordered from highest to lowest similarity
# (ascending argsort, reversed along the columns).
title_sim_sorted_ind = title_sim.argsort()[:,::-1]
print(title_sim_sorted_ind[:1]) # record positions in descending-similarity order (first row only)

[[   0 1070  784 ... 1989 1988 1503]]


In [None]:
# Show the first five positions of the first row.
print(title_sim_sorted_ind[:1,:5])

[[   0 1070  784  808  644]]


In [None]:
# Keep only the first five positions per row. In the recorded output for row 0
# the first of them is 0, i.e. the article itself.
title_sim_sorted_ind = title_sim_sorted_ind[:,:5]

In [None]:
# Rank-based scores: the first (most similar) position gets 5, the fifth gets 1.
scores=[5,4,3,2,1]

columns = ['Article_Index', 'Similar_Article_Index', 'Score']

# Build the whole table in one step instead of concatenating one small frame
# per article, which copies the growing frame on every iteration.
# Article_Index and Similar_Article_Index are row positions, not articleIDs.
n_articles, n_neighbours = title_sim_sorted_ind.shape
score_df = pd.DataFrame({
    'Article_Index': np.repeat(np.arange(n_articles), n_neighbours),
    'Similar_Article_Index': np.asarray(title_sim_sorted_ind).ravel(),
    'Score': np.tile(scores, n_articles),
}, columns=columns)

In [None]:
score_df # score DataFrame: for each item, its 5 nearest items with rank-based scores

Unnamed: 0,Article_Index,Similar_Article_Index,Score
0,0,0,5
1,0,1070,4
2,0,784,3
3,0,808,2
4,0,644,1
...,...,...,...
15035,3007,3007,5
15036,3007,862,4
15037,3007,683,3
15038,3007,2329,2


## 각 아이템 기준으로 가까운 1,2,3,4,5 순위를 매겨 score 데이터 프레임 완성
## 이후 진행: 유저별로 읽은 기사들을 모아서 score를 합산 → 점수가 높은 5개 기사를 추천 (일단 groupby('userID')를 사용하는 건 맞을 것 같음)

In [None]:
# Save the score table to score_df.csv without the row index.
score_df.to_csv('score_df.csv', index=False)

In [None]:
# Index the article metadata by articleID. The guard makes re-running the cell a
# no-op; without it a second set_index raises KeyError because the column is gone.
if 'articleID' in article_info.columns:
    article_info = article_info.set_index('articleID')

In [None]:
# score_df identifies articles by row position in article_info, while the view
# log identifies them by articleID strings. Comparing the two directly never
# matches, which left user_scores empty, so translate positions to articleIDs.
# NOTE(review): assumes article_info still has the row order that was used to
# build the title similarity matrix - confirm.
if article_info.index.name == 'articleID':
    article_ids = np.asarray(article_info.index)
else:
    article_ids = np.asarray(article_info['articleID'])

# astype(...) because score_df may hold these columns with object dtype.
similar_scores = pd.DataFrame({
    'articleID': article_ids[score_df['Article_Index'].astype(int).values],
    'similarID': article_ids[score_df['Similar_Article_Index'].astype(int).values],
    'Score': score_df['Score'].astype(float).values,
})

user_recommendations = {}
user_groups = view_log_train.groupby('userID')

for user, group in user_groups:
    # One row per (viewed article, similar article) pair. A repeated view of the
    # same article contributes its scores again, as in the original loop.
    matched = group[['articleID']].merge(similar_scores, on='articleID', how='inner')

    # Total score per candidate article; sort=False keeps first-seen order.
    user_scores = matched.groupby('similarID', sort=False)['Score'].sum()

    # Excluding already-read articles is currently disabled:
    #user_scores = user_scores.drop(group['articleID'].values, errors='ignore')

    # Recommend the five articleIDs with the highest total score.
    top_recommendations = user_scores.nlargest(5).index.tolist()
    user_recommendations[user] = top_recommendations

In [None]:
# Debug check: sum the score_df rows whose Article_Index equals `article`, a name
# this cell does not define (presumably left over from a loop in an earlier cell).
# NOTE(review): Article_Index is filled with integer row positions, while view-log
# articleIDs are strings such as 'ARTICLE_0661', so an articleID never matches;
# the recorded output below is all zeros.
score_df[score_df['Article_Index'] == article].sum()

Article_Index            0
Similar_Article_Index    0
Score                    0
dtype: object

In [None]:
# Display the groupby object created from view_log_train.groupby('userID').
user_groups

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7ed6d700f3a0>

In [None]:
# Show the first rows of each user's group.
user_groups.head()

Unnamed: 0,userID,articleID,userRegion,userCountry
0,USER_0000,ARTICLE_0661,NY,US
1,USER_0000,ARTICLE_2316,NY,US
2,USER_0000,ARTICLE_1345,NY,US
3,USER_0000,ARTICLE_1089,NY,US
4,USER_0000,ARTICLE_1484,NY,US
...,...,...,...,...
42657,USER_1420,ARTICLE_2779,SP,BR
42658,USER_1420,ARTICLE_0614,SP,BR
42659,USER_1420,ARTICLE_2122,SP,BR
42660,USER_1420,ARTICLE_0456,SP,BR


In [None]:
user_scores # problem building user_scores: the recorded run shows an empty Series

Series([], dtype: float64)