<a href="https://colab.research.google.com/github/KJM94/Single_project/blob/main/%EC%9B%B9%20%EA%B8%B0%EC%82%AC%20%EC%B6%94%EC%B2%9C%20AI%20%EA%B2%BD%EC%A7%84%EB%8C%80%ED%9A%8C/Untitled5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load datasets
article_info = pd.read_csv('./article_info.csv')
view_log = pd.read_csv('./view_log.csv')

# Combine title and content for TF-IDF vectorization.
# Missing titles/bodies are treated as empty strings: `NaN + str` stays NaN,
# and TfidfVectorizer rejects NaN documents.
article_info['text'] = article_info['Title'].fillna('') + " " + article_info['Content'].fillna('')

# Compute the TF-IDF matrix
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(article_info['text'])

# Compute cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Create a mapping of articleID to row label.
# Series.drop_duplicates() would de-duplicate the *values* (row labels, which
# are already unique), so repeated articleIDs are removed through the index
# instead; this keeps every lookup a scalar.
article_indices = pd.Series(article_info.index, index=article_info['articleID'])
article_indices = article_indices[~article_indices.index.duplicated(keep='first')]

# Function to get article recommendations based on content similarity
def get_content_recommendations(article_id, num_recommendations=5):
    """Return the IDs of the articles most similar in content to ``article_id``.

    Args:
        article_id: ID looked up in the module-level ``article_indices`` mapping.
        num_recommendations: Maximum number of article IDs to return.

    Returns:
        A list of article IDs ordered by decreasing ``cosine_sim`` score (ties
        keep row order). Empty when ``article_id`` is not in ``article_indices``.
    """
    idx = article_indices.get(article_id, None)
    if idx is None:
        return []
    if isinstance(idx, pd.Series):
        # A duplicated ID in the mapping yields a Series; use its first row.
        idx = idx.iloc[0]
    # Stable descending ranking: same tie order as sorting (row, score) pairs.
    ranked = np.argsort(-np.asarray(cosine_sim[idx]), kind='stable')
    # Remove the query article explicitly. It is not guaranteed to rank first
    # when another row has an identical (or all-zero) similarity to it.
    ranked = ranked[ranked != idx]
    top_rows = ranked[:max(num_recommendations, 0)]
    return article_info['articleID'].iloc[top_rows].values.tolist()

# Generate user-article interaction matrix
user_article_matrix = view_log.pivot_table(index='userID', columns='articleID', aggfunc='size', fill_value=0)

# Compute user similarity matrix using collaborative filtering
user_similarity = cosine_similarity(user_article_matrix)

# Generate content-based recommendations.
# The log is grouped once instead of re-filtering the whole frame for every
# user; sort=False keeps users in first-appearance order, like Series.unique().
content_recommendations = []
for user, viewed_articles in view_log.groupby('userID', sort=False)['articleID']:
    recommendations = []
    for article in viewed_articles:
        recommendations.extend(get_content_recommendations(article, num_recommendations=1))
    # Order-preserving de-duplication: slicing a set would keep an arbitrary five.
    recommendations = list(dict.fromkeys(recommendations))[:5]
    for rec in recommendations:
        content_recommendations.append([user, rec])

# Convert content-based recommendations to DataFrame
content_recommendations_df = pd.DataFrame(content_recommendations, columns=['userID', 'articleID'])

# Hybrid approach: combining collaborative filtering with content-based filtering
def hybrid_recommendation(user_id, num_recommendations=5):
    """Recommend articles by merging collaborative and content-based picks.

    Args:
        user_id: A label present in ``user_article_matrix.index``.
        num_recommendations: Maximum number of article IDs to return.

    Returns:
        A list of article IDs that alternates between the collaborative
        ranking and the content-based ranking, without duplicates.

    Raises:
        KeyError: If ``user_id`` is not in ``user_article_matrix.index``.
    """
    # Collaborative filtering: similarity-weighted view counts over all users.
    user_idx = user_article_matrix.index.get_loc(user_id)
    similarities = user_similarity[user_idx]
    cf_scores = similarities.dot(user_article_matrix.to_numpy()) / np.abs(similarities).sum()
    top_positions = np.argsort(cf_scores)[::-1][:num_recommendations]
    # argsort yields column positions; translate them to article IDs so the
    # result never mixes integers with IDs.
    cf_recommendations = user_article_matrix.columns[top_positions].tolist()

    # Content-based filtering: the nearest article to each viewed article.
    viewed_articles = view_log[view_log['userID'] == user_id]['articleID'].tolist()
    cb_recommendations = []
    for article in viewed_articles:
        cb_recommendations.extend(get_content_recommendations(article, num_recommendations=1))
    # dict.fromkeys de-duplicates while keeping first-seen order (a set would not).
    cb_recommendations = list(dict.fromkeys(cb_recommendations))[:num_recommendations]

    # Combine: take rank 1 of each list, then rank 2 of each, and so on, so
    # both sources are represented deterministically.
    recommendations = []
    for rank in range(max(len(cf_recommendations), len(cb_recommendations))):
        for ranked_ids in (cf_recommendations, cb_recommendations):
            if rank < len(ranked_ids) and ranked_ids[rank] not in recommendations:
                recommendations.append(ranked_ids[rank])
    return recommendations[:num_recommendations]

# Collect one [userID, articleID] row per recommendation, for every user in the log
hybrid_recommendations = [
    [user, rec]
    for user in view_log['userID'].unique()
    for rec in hybrid_recommendation(user, num_recommendations=5)
]

# Tabulate the collected rows
hybrid_recommendations_df = pd.DataFrame(
    data=hybrid_recommendations,
    columns=['userID', 'articleID'],
)

# Write the table to disk without the row index
hybrid_recommendations_df.to_csv('./hybrid_recommendations.csv', index=False)

# Preview the first five rows
print(hybrid_recommendations_df.head(n=5))


      userID     articleID
0  USER_0000  ARTICLE_2806
1  USER_0000           390
2  USER_0000  ARTICLE_1053
3  USER_0000          2156
4  USER_0000  ARTICLE_2642


In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load datasets
article_info = pd.read_csv('./article_info.csv')
view_log = pd.read_csv('./view_log.csv')

# Combine title and content for TF-IDF vectorization.
# Missing titles/bodies are treated as empty strings: `NaN + str` stays NaN,
# and TfidfVectorizer rejects NaN documents.
article_info['text'] = article_info['Title'].fillna('') + " " + article_info['Content'].fillna('')

# Compute the TF-IDF matrix
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(article_info['text'])

# Compute cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Create a mapping of articleID to row label.
# Series.drop_duplicates() would de-duplicate the *values* (row labels, which
# are already unique), so repeated articleIDs are removed through the index
# instead; this keeps every lookup a scalar.
article_indices = pd.Series(article_info.index, index=article_info['articleID'])
article_indices = article_indices[~article_indices.index.duplicated(keep='first')]

# Function to get article recommendations based on content similarity
def get_content_recommendations(article_id, num_recommendations=5):
    """Return the IDs of the articles most similar in content to ``article_id``.

    Args:
        article_id: ID looked up in the module-level ``article_indices`` mapping.
        num_recommendations: Maximum number of article IDs to return.

    Returns:
        A list of article IDs ordered by decreasing ``cosine_sim`` score (ties
        keep row order). Empty when ``article_id`` is not in ``article_indices``.
    """
    idx = article_indices.get(article_id, None)
    if idx is None:
        return []
    if isinstance(idx, pd.Series):
        # A duplicated ID in the mapping yields a Series; use its first row.
        idx = idx.iloc[0]
    # Stable descending ranking: same tie order as sorting (row, score) pairs.
    ranked = np.argsort(-np.asarray(cosine_sim[idx]), kind='stable')
    # Remove the query article explicitly. It is not guaranteed to rank first
    # when another row has an identical (or all-zero) similarity to it.
    ranked = ranked[ranked != idx]
    top_rows = ranked[:max(num_recommendations, 0)]
    return article_info['articleID'].iloc[top_rows].values.tolist()

# Build the user x article table of view counts; pairs absent from the log are filled with 0
user_article_matrix = pd.pivot_table(
    view_log,
    index='userID',
    columns='articleID',
    aggfunc='size',
    fill_value=0,
)

# Pairwise cosine similarity between user rows, used for collaborative filtering
user_similarity = cosine_similarity(user_article_matrix)

# Hybrid approach: combining collaborative filtering with content-based filtering
def hybrid_recommendation(user_id, num_recommendations=5, weight_cf=0.5, weight_cb=0.5):
    """Rank unseen articles by a weighted sum of collaborative and content scores.

    Args:
        user_id: A label present in ``user_article_matrix.index``.
        num_recommendations: Maximum number of article IDs to return.
        weight_cf: Multiplier applied to the collaborative-filtering score.
        weight_cb: Multiplier applied to the content-similarity score.

    Returns:
        A list of article IDs the user has not viewed, best score first.

    Raises:
        KeyError: If ``user_id`` is not in ``user_article_matrix.index``.
    """
    # Collaborative filtering: similarity-weighted view counts over all users.
    user_idx = user_article_matrix.index.get_loc(user_id)
    similarities = user_similarity[user_idx]
    cf_scores = similarities.dot(user_article_matrix.to_numpy()) / np.abs(similarities).sum()

    viewed_articles = view_log[view_log['userID'] == user_id]['articleID'].tolist()
    viewed = set(viewed_articles)

    # Keep the 2 * num_recommendations best *unseen* CF candidates, keyed by
    # article ID (argsort yields column positions, not IDs).
    combined_scores = {}
    article_ids = user_article_matrix.columns
    for position in np.argsort(cf_scores)[::-1]:
        if len(combined_scores) >= num_recommendations * 2:
            break
        article = article_ids[position]
        if article not in viewed:
            combined_scores[article] = weight_cf * cf_scores[position]

    # Content-based: score each candidate by its similarity to the viewed
    # article that produced it, keeping the best value per candidate.
    cb_similarity = {}
    for source in dict.fromkeys(viewed_articles):
        source_idx = article_indices.get(source, None)
        if source_idx is None:
            continue  # viewed article has no content row
        for candidate in get_content_recommendations(source, num_recommendations=1):
            candidate_idx = article_indices.get(candidate, None)
            if candidate_idx is None or candidate in viewed:
                continue
            similarity = cosine_sim[source_idx][candidate_idx]
            if similarity > cb_similarity.get(candidate, float('-inf')):
                cb_similarity[candidate] = similarity
    for candidate, similarity in cb_similarity.items():
        combined_scores[candidate] = combined_scores.get(candidate, 0) + weight_cb * similarity

    ranked = sorted(combined_scores.items(), key=lambda item: item[1], reverse=True)
    return [article for article, _ in ranked][:num_recommendations]

# Collect one [userID, articleID] row per recommendation, for every user in the log
hybrid_recommendations = [
    [user, rec]
    for user in view_log['userID'].unique()
    for rec in hybrid_recommendation(user, num_recommendations=5)
]

# Tabulate the collected rows
hybrid_recommendations_df = pd.DataFrame(
    data=hybrid_recommendations,
    columns=['userID', 'articleID'],
)

# Write the table to disk without the row index
hybrid_recommendations_df.to_csv('./hybrid_recommendations_optimized.csv', index=False)

# Preview the first five rows
print(hybrid_recommendations_df.head(n=5))


      userID     articleID
0  USER_0000           390
1  USER_0000  ARTICLE_1052
2  USER_0000           635
3  USER_0000          1498
4  USER_0000          1176


In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load datasets
article_info = pd.read_csv('./article_info.csv')
view_log = pd.read_csv('./view_log.csv')

# Combine title and content for TF-IDF vectorization.
# Missing titles/bodies are treated as empty strings: `NaN + str` stays NaN,
# and TfidfVectorizer rejects NaN documents.
article_info['text'] = article_info['Title'].fillna('') + " " + article_info['Content'].fillna('')

# Compute the TF-IDF matrix
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(article_info['text'])

# Compute cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Create a mapping of articleID to row label.
# Series.drop_duplicates() would de-duplicate the *values* (row labels, which
# are already unique), so repeated articleIDs are removed through the index
# instead; this keeps every lookup a scalar.
article_indices = pd.Series(article_info.index, index=article_info['articleID'])
article_indices = article_indices[~article_indices.index.duplicated(keep='first')]

# Function to get article recommendations based on content similarity
def get_content_recommendations(article_id, num_recommendations=5):
    """Return the IDs of the articles most similar in content to ``article_id``.

    Args:
        article_id: ID looked up in the module-level ``article_indices`` mapping.
        num_recommendations: Maximum number of article IDs to return.

    Returns:
        A list of article IDs ordered by decreasing ``cosine_sim`` score (ties
        keep row order). Empty when ``article_id`` is not in ``article_indices``.
    """
    idx = article_indices.get(article_id, None)
    if idx is None:
        return []
    if isinstance(idx, pd.Series):
        # A duplicated ID in the mapping yields a Series; use its first row.
        idx = idx.iloc[0]
    # Stable descending ranking: same tie order as sorting (row, score) pairs.
    ranked = np.argsort(-np.asarray(cosine_sim[idx]), kind='stable')
    # Remove the query article explicitly. It is not guaranteed to rank first
    # when another row has an identical (or all-zero) similarity to it.
    ranked = ranked[ranked != idx]
    top_rows = ranked[:max(num_recommendations, 0)]
    return article_info['articleID'].iloc[top_rows].values.tolist()

# Build the user x article table of view counts; pairs absent from the log are filled with 0
user_article_matrix = pd.pivot_table(
    view_log,
    index='userID',
    columns='articleID',
    aggfunc='size',
    fill_value=0,
)

# Pairwise cosine similarity between user rows, used for collaborative filtering
user_similarity = cosine_similarity(user_article_matrix)

# Hybrid approach: combining collaborative filtering with content-based filtering
def hybrid_recommendation(user_id, num_recommendations=5, weight_cf=0.7, weight_cb=0.3, top_n_similar_users=10):
    """Rank unseen articles by a weighted sum of neighbourhood CF and content scores.

    Args:
        user_id: A label present in ``user_article_matrix.index``.
        num_recommendations: Maximum number of article IDs to return.
        weight_cf: Multiplier applied to the collaborative-filtering score.
        weight_cb: Multiplier applied to the content-similarity score.
        top_n_similar_users: Number of highest-similarity rows of
            ``user_similarity`` used as the neighbourhood. The rows come
            straight from the similarity matrix, so the user's own row is
            normally among them.

    Returns:
        A list of article IDs the user has not viewed, best score first.

    Raises:
        KeyError: If ``user_id`` is not in ``user_article_matrix.index``.
    """
    # Collaborative filtering restricted to the most similar users.
    user_idx = user_article_matrix.index.get_loc(user_id)
    user_sim_scores = user_similarity[user_idx]
    top_users_idx = np.argsort(user_sim_scores)[::-1][:top_n_similar_users]
    neighbour_sims = user_sim_scores[top_users_idx]
    neighbour_views = user_article_matrix.iloc[top_users_idx].to_numpy()
    cf_scores = neighbour_sims.dot(neighbour_views) / np.abs(neighbour_sims).sum()

    viewed_articles = view_log[view_log['userID'] == user_id]['articleID'].tolist()
    viewed = set(viewed_articles)

    # Keep the 2 * num_recommendations best *unseen* CF candidates, keyed by
    # article ID (argsort yields column positions, not IDs).
    combined_scores = {}
    article_ids = user_article_matrix.columns
    for position in np.argsort(cf_scores)[::-1]:
        if len(combined_scores) >= num_recommendations * 2:
            break
        article = article_ids[position]
        if article not in viewed:
            combined_scores[article] = weight_cf * cf_scores[position]

    # Content-based: score each candidate by its similarity to the viewed
    # article that produced it, keeping the best value per candidate.
    cb_similarity = {}
    for source in dict.fromkeys(viewed_articles):
        source_idx = article_indices.get(source, None)
        if source_idx is None:
            continue  # viewed article has no content row
        for candidate in get_content_recommendations(source, num_recommendations=2):
            candidate_idx = article_indices.get(candidate, None)
            if candidate_idx is None or candidate in viewed:
                continue
            similarity = cosine_sim[source_idx][candidate_idx]
            if similarity > cb_similarity.get(candidate, float('-inf')):
                cb_similarity[candidate] = similarity
    for candidate, similarity in cb_similarity.items():
        combined_scores[candidate] = combined_scores.get(candidate, 0) + weight_cb * similarity

    ranked = sorted(combined_scores.items(), key=lambda item: item[1], reverse=True)
    return [article for article, _ in ranked][:num_recommendations]

# Collect one [userID, articleID] row per recommendation, for every user in the log
hybrid_recommendations = [
    [user, rec]
    for user in view_log['userID'].unique()
    for rec in hybrid_recommendation(user, num_recommendations=5)
]

# Tabulate the collected rows
hybrid_recommendations_df = pd.DataFrame(
    data=hybrid_recommendations,
    columns=['userID', 'articleID'],
)

# Write the table to disk without the row index
hybrid_recommendations_df.to_csv('./hybrid_recommendations_tuned.csv', index=False)

# Preview the first five rows
print(hybrid_recommendations_df.head(n=5))


      userID articleID
0  USER_0000       390
1  USER_0000      2156
2  USER_0000      2713
3  USER_0000      2212
4  USER_0000      1282


In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import TruncatedSVD

# Load the data
article_info = pd.read_csv('./article_info.csv')
view_log = pd.read_csv('./view_log.csv')

# Combine title and content for TF-IDF vectorization.
# Missing titles/bodies are treated as empty strings: `NaN + str` stays NaN,
# and TfidfVectorizer rejects NaN documents.
article_info['text'] = article_info['Title'].fillna('') + " " + article_info['Content'].fillna('')

# Compute the TF-IDF matrix
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(article_info['text'])

# Compute cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Create a mapping of articleID to row label.
# Series.drop_duplicates() would de-duplicate the *values* (row labels, which
# are already unique), so repeated articleIDs are removed through the index
# instead; this keeps every lookup a scalar.
article_indices = pd.Series(article_info.index, index=article_info['articleID'])
article_indices = article_indices[~article_indices.index.duplicated(keep='first')]

# Function to get article recommendations based on content similarity
def get_content_recommendations(article_id, num_recommendations=5):
    """Return the IDs of the articles most similar in content to ``article_id``.

    Args:
        article_id: ID looked up in the module-level ``article_indices`` mapping.
        num_recommendations: Maximum number of article IDs to return.

    Returns:
        A list of article IDs ordered by decreasing ``cosine_sim`` score (ties
        keep row order). Empty when ``article_id`` is not in ``article_indices``.
    """
    idx = article_indices.get(article_id, None)
    if idx is None:
        return []
    if isinstance(idx, pd.Series):
        # A duplicated ID in the mapping yields a Series; use its first row.
        idx = idx.iloc[0]
    # Stable descending ranking: same tie order as sorting (row, score) pairs.
    ranked = np.argsort(-np.asarray(cosine_sim[idx]), kind='stable')
    # Remove the query article explicitly. It is not guaranteed to rank first
    # when another row has an identical (or all-zero) similarity to it.
    ranked = ranked[ranked != idx]
    top_rows = ranked[:max(num_recommendations, 0)]
    return article_info['articleID'].iloc[top_rows].values.tolist()

# Build the user x article table of view counts; pairs absent from the log are filled with 0
user_article_matrix = pd.pivot_table(
    view_log,
    index='userID',
    columns='articleID',
    aggfunc='size',
    fill_value=0,
)

# Pairwise cosine similarity between user rows, used for collaborative filtering
user_similarity = cosine_similarity(user_article_matrix)

# Hybrid approach: combining collaborative filtering with content-based filtering
def hybrid_recommendation(user_id, num_recommendations=5, weight_cf=0.7, weight_cb=0.3, top_n_similar_users=10):
    """Rank unseen articles by a weighted sum of neighbourhood CF and content scores.

    Args:
        user_id: A label present in ``user_article_matrix.index``.
        num_recommendations: Maximum number of article IDs to return.
        weight_cf: Multiplier applied to the collaborative-filtering score.
        weight_cb: Multiplier applied to the content-similarity score.
        top_n_similar_users: Number of highest-similarity rows of
            ``user_similarity`` used as the neighbourhood. The rows come
            straight from the similarity matrix, so the user's own row is
            normally among them.

    Returns:
        A list of article IDs the user has not viewed, best score first.

    Raises:
        KeyError: If ``user_id`` is not in ``user_article_matrix.index``.
    """
    # Collaborative filtering restricted to the most similar users.
    user_idx = user_article_matrix.index.get_loc(user_id)
    user_sim_scores = user_similarity[user_idx]
    top_users_idx = np.argsort(user_sim_scores)[::-1][:top_n_similar_users]
    neighbour_sims = user_sim_scores[top_users_idx]
    neighbour_views = user_article_matrix.iloc[top_users_idx].to_numpy()
    cf_scores = neighbour_sims.dot(neighbour_views) / np.abs(neighbour_sims).sum()

    viewed_articles = view_log[view_log['userID'] == user_id]['articleID'].tolist()
    viewed = set(viewed_articles)

    # Keep the 2 * num_recommendations best *unseen* CF candidates, keyed by
    # article ID (argsort yields column positions, not IDs).
    combined_scores = {}
    article_ids = user_article_matrix.columns
    for position in np.argsort(cf_scores)[::-1]:
        if len(combined_scores) >= num_recommendations * 2:
            break
        article = article_ids[position]
        if article not in viewed:
            combined_scores[article] = weight_cf * cf_scores[position]

    # Content-based: score each candidate by its similarity to the viewed
    # article that produced it, keeping the best value per candidate.
    cb_similarity = {}
    for source in dict.fromkeys(viewed_articles):
        source_idx = article_indices.get(source, None)
        if source_idx is None:
            continue  # viewed article has no content row
        for candidate in get_content_recommendations(source, num_recommendations=2):
            candidate_idx = article_indices.get(candidate, None)
            if candidate_idx is None or candidate in viewed:
                continue
            similarity = cosine_sim[source_idx][candidate_idx]
            if similarity > cb_similarity.get(candidate, float('-inf')):
                cb_similarity[candidate] = similarity
    for candidate, similarity in cb_similarity.items():
        combined_scores[candidate] = combined_scores.get(candidate, 0) + weight_cb * similarity

    ranked = sorted(combined_scores.items(), key=lambda item: item[1], reverse=True)
    return [article for article, _ in ranked][:num_recommendations]

# Collect one [userID, articleID] row per recommendation, for every user in the log
hybrid_recommendations = [
    [user, rec]
    for user in view_log['userID'].unique()
    for rec in hybrid_recommendation(user, num_recommendations=5)
]

# Tabulate the collected rows
hybrid_recommendations_df = pd.DataFrame(
    data=hybrid_recommendations,
    columns=['userID', 'articleID'],
)

# Write the table to disk without the row index
hybrid_recommendations_df.to_csv('./hybrid_recommendations_optimized.csv', index=False)

# Preview the first five rows
print(hybrid_recommendations_df.head(n=5))


      userID articleID
0  USER_0000       390
1  USER_0000      2156
2  USER_0000      2713
3  USER_0000      2212
4  USER_0000      1282


In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds

# Load the data
article_info = pd.read_csv('./article_info.csv')
view_log = pd.read_csv('./view_log.csv')

# Combine title and content for TF-IDF vectorization.
# Missing titles/bodies are treated as empty strings: `NaN + str` stays NaN,
# and TfidfVectorizer rejects NaN documents.
article_info['text'] = article_info['Title'].fillna('') + " " + article_info['Content'].fillna('')

# Compute the TF-IDF matrix
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(article_info['text'])

# Compute cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Create a mapping of articleID to row label.
# Series.drop_duplicates() would de-duplicate the *values* (row labels, which
# are already unique), so repeated articleIDs are removed through the index
# instead; this keeps every lookup a scalar.
article_indices = pd.Series(article_info.index, index=article_info['articleID'])
article_indices = article_indices[~article_indices.index.duplicated(keep='first')]

# Function to get article recommendations based on content similarity
def get_content_recommendations(article_id, num_recommendations=5):
    """Return the IDs of the articles most similar in content to ``article_id``.

    Args:
        article_id: ID looked up in the module-level ``article_indices`` mapping.
        num_recommendations: Maximum number of article IDs to return.

    Returns:
        A list of article IDs ordered by decreasing ``cosine_sim`` score (ties
        keep row order). Empty when ``article_id`` is not in ``article_indices``.
    """
    idx = article_indices.get(article_id, None)
    if idx is None:
        return []
    if isinstance(idx, pd.Series):
        # A duplicated ID in the mapping yields a Series; use its first row.
        idx = idx.iloc[0]
    # Stable descending ranking: same tie order as sorting (row, score) pairs.
    ranked = np.argsort(-np.asarray(cosine_sim[idx]), kind='stable')
    # Remove the query article explicitly. It is not guaranteed to rank first
    # when another row has an identical (or all-zero) similarity to it.
    ranked = ranked[ranked != idx]
    top_rows = ranked[:max(num_recommendations, 0)]
    return article_info['articleID'].iloc[top_rows].values.tolist()

# Generate user-article interaction matrix
user_article_matrix = view_log.pivot_table(index='userID', columns='articleID', aggfunc='size', fill_value=0)

# Normalize the matrix by subtracting each user's mean view count (computed once, reused below)
user_means = user_article_matrix.mean(axis=1)
user_article_matrix_norm = user_article_matrix.subtract(user_means, axis=0)

# Perform SVD.
# The default svds solver needs k < min(matrix shape); cap the 50 requested
# factors so a small log does not abort the run. A plain float ndarray is
# passed because that is an input type svds documents.
n_factors = min(50, min(user_article_matrix_norm.shape) - 1)
U, sigma, Vt = svds(user_article_matrix_norm.to_numpy(dtype=np.float64), k=n_factors)
sigma = np.diag(sigma)

# Calculate predicted ratings: low-rank reconstruction plus the removed user means
predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_means.values.reshape(-1, 1)
predicted_ratings_df = pd.DataFrame(predicted_ratings, columns=user_article_matrix.columns, index=user_article_matrix.index)

# Hybrid recommendation function
def hybrid_recommendation(user_id, num_recommendations=5, weight_cf=0.7, weight_cb=0.3, top_n_similar_users=10):
    """Recommend unseen articles by voting between SVD-based CF and content similarity.

    Args:
        user_id: A label present in ``predicted_ratings_df.index``.
        num_recommendations: Maximum number of article IDs to return.
        weight_cf: Score given to each collaborative-filtering candidate.
        weight_cb: Score given to each content-based candidate.
        top_n_similar_users: Unused here; kept so existing calls keep working.

    Returns:
        A list of article IDs the user has not viewed, highest combined score
        first. Ties keep CF rank order, then content-candidate order.

    Raises:
        KeyError: If ``user_id`` is not in ``predicted_ratings_df.index``.
    """
    viewed_articles = view_log[view_log['userID'] == user_id]['articleID'].tolist()
    viewed = set(viewed_articles)

    # Collaborative filtering: drop viewed articles *before* truncating, so
    # they do not use up the 2 * num_recommendations candidate slots.
    ranked_by_cf = predicted_ratings_df.loc[user_id].sort_values(ascending=False).index
    cf_recommendations = [article for article in ranked_by_cf if article not in viewed]
    cf_recommendations = cf_recommendations[:num_recommendations * 2]

    # Content-based filtering: two nearest articles per viewed article.
    cb_recommendations = []
    for article in viewed_articles:
        cb_recommendations.extend(get_content_recommendations(article, num_recommendations=2))
    # Order-preserving de-duplication keeps tie-breaking reproducible.
    cb_recommendations = list(dict.fromkeys(cb_recommendations))

    # Combine and rank recommendations
    combined_scores = {}
    for article in cf_recommendations:
        combined_scores[article] = combined_scores.get(article, 0) + weight_cf
    for article in cb_recommendations:
        combined_scores[article] = combined_scores.get(article, 0) + weight_cb

    sorted_recommendations = sorted(combined_scores.items(), key=lambda item: item[1], reverse=True)
    unseen = [article for article, _ in sorted_recommendations if article not in viewed]
    return unseen[:num_recommendations]

# Collect one [userID, articleID] row per recommendation, for every user in the log
hybrid_recommendations = [
    [user, rec]
    for user in view_log['userID'].unique()
    for rec in hybrid_recommendation(user, num_recommendations=5)
]

# Tabulate the collected rows
hybrid_recommendations_df = pd.DataFrame(
    data=hybrid_recommendations,
    columns=['userID', 'articleID'],
)

# Write the table to disk without the row index
hybrid_recommendations_df.to_csv('./hybrid_recommendations_optimized.csv', index=False)

# Preview the first five rows
print(hybrid_recommendations_df.head(n=5))


      userID     articleID
0  USER_0000  ARTICLE_1568
1  USER_0000  ARTICLE_2720
2  USER_0000  ARTICLE_1948
3  USER_0000  ARTICLE_2147
4  USER_0000  ARTICLE_0561


In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds

# Load the data
article_info = pd.read_csv('./article_info.csv')
view_log = pd.read_csv('./view_log.csv')

# Combine title and content for TF-IDF vectorization.
# Missing titles/bodies are treated as empty strings: `NaN + str` stays NaN,
# and TfidfVectorizer rejects NaN documents.
article_info['text'] = article_info['Title'].fillna('') + " " + article_info['Content'].fillna('')

# Compute the TF-IDF matrix
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(article_info['text'])

# Compute cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Create a mapping of articleID to row label.
# Series.drop_duplicates() would de-duplicate the *values* (row labels, which
# are already unique), so repeated articleIDs are removed through the index
# instead; this keeps every lookup a scalar.
article_indices = pd.Series(article_info.index, index=article_info['articleID'])
article_indices = article_indices[~article_indices.index.duplicated(keep='first')]

# Function to get article recommendations based on content similarity
def get_content_recommendations(article_id, num_recommendations=5):
    """Return the IDs of the articles most similar in content to ``article_id``.

    Args:
        article_id: ID looked up in the module-level ``article_indices`` mapping.
        num_recommendations: Maximum number of article IDs to return.

    Returns:
        A list of article IDs ordered by decreasing ``cosine_sim`` score (ties
        keep row order). Empty when ``article_id`` is not in ``article_indices``.
    """
    idx = article_indices.get(article_id, None)
    if idx is None:
        return []
    if isinstance(idx, pd.Series):
        # A duplicated ID in the mapping yields a Series; use its first row.
        idx = idx.iloc[0]
    # Stable descending ranking: same tie order as sorting (row, score) pairs.
    ranked = np.argsort(-np.asarray(cosine_sim[idx]), kind='stable')
    # Remove the query article explicitly. It is not guaranteed to rank first
    # when another row has an identical (or all-zero) similarity to it.
    ranked = ranked[ranked != idx]
    top_rows = ranked[:max(num_recommendations, 0)]
    return article_info['articleID'].iloc[top_rows].values.tolist()

# Generate user-article interaction matrix
user_article_matrix = view_log.pivot_table(index='userID', columns='articleID', aggfunc='size', fill_value=0)

# Normalize the matrix by subtracting each user's mean view count (computed once, reused below)
user_means = user_article_matrix.mean(axis=1)
user_article_matrix_norm = user_article_matrix.subtract(user_means, axis=0)

# Perform SVD.
# The default svds solver needs k < min(matrix shape); cap the 50 requested
# factors so a small log does not abort the run. A plain float ndarray is
# passed because that is an input type svds documents.
n_factors = min(50, min(user_article_matrix_norm.shape) - 1)
U, sigma, Vt = svds(user_article_matrix_norm.to_numpy(dtype=np.float64), k=n_factors)
sigma = np.diag(sigma)

# Calculate predicted ratings: low-rank reconstruction plus the removed user means
predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_means.values.reshape(-1, 1)
predicted_ratings_df = pd.DataFrame(predicted_ratings, columns=user_article_matrix.columns, index=user_article_matrix.index)

# Hybrid recommendation function
def hybrid_recommendation(user_id, num_recommendations=5, weight_cf=0.7, weight_cb=0.3, top_n_similar_users=10):
    """Recommend unseen articles by voting between SVD-based CF and content similarity.

    Args:
        user_id: A label present in ``predicted_ratings_df.index``.
        num_recommendations: Maximum number of article IDs to return.
        weight_cf: Score given to each collaborative-filtering candidate.
        weight_cb: Score given to each content-based candidate.
        top_n_similar_users: Unused here; kept so existing calls keep working.

    Returns:
        A list of article IDs the user has not viewed, highest combined score
        first. Ties keep CF rank order, then content-candidate order.

    Raises:
        KeyError: If ``user_id`` is not in ``predicted_ratings_df.index``.
    """
    viewed_articles = view_log[view_log['userID'] == user_id]['articleID'].tolist()
    viewed = set(viewed_articles)

    # Collaborative filtering: drop viewed articles *before* truncating, so
    # they do not use up the 2 * num_recommendations candidate slots.
    ranked_by_cf = predicted_ratings_df.loc[user_id].sort_values(ascending=False).index
    cf_recommendations = [article for article in ranked_by_cf if article not in viewed]
    cf_recommendations = cf_recommendations[:num_recommendations * 2]

    # Content-based filtering: two nearest articles per viewed article.
    cb_recommendations = []
    for article in viewed_articles:
        cb_recommendations.extend(get_content_recommendations(article, num_recommendations=2))
    # Order-preserving de-duplication keeps tie-breaking reproducible.
    cb_recommendations = list(dict.fromkeys(cb_recommendations))

    # Combine and rank recommendations
    combined_scores = {}
    for article in cf_recommendations:
        combined_scores[article] = combined_scores.get(article, 0) + weight_cf
    for article in cb_recommendations:
        combined_scores[article] = combined_scores.get(article, 0) + weight_cb

    sorted_recommendations = sorted(combined_scores.items(), key=lambda item: item[1], reverse=True)
    unseen = [article for article, _ in sorted_recommendations if article not in viewed]
    return unseen[:num_recommendations]

# Collect one [userID, articleID] row per recommendation, for every user in the log
hybrid_recommendations = [
    [user, rec]
    for user in view_log['userID'].unique()
    for rec in hybrid_recommendation(user, num_recommendations=5)
]

# Tabulate the collected rows
hybrid_recommendations_df = pd.DataFrame(
    data=hybrid_recommendations,
    columns=['userID', 'articleID'],
)

# Write the table to disk without the row index
hybrid_recommendations_df.to_csv('./hybrid_recommendations_optimized.csv', index=False)

# Preview the first five rows
print(hybrid_recommendations_df.head(n=5))


      userID     articleID
0  USER_0000  ARTICLE_1568
1  USER_0000  ARTICLE_2720
2  USER_0000  ARTICLE_1948
3  USER_0000  ARTICLE_2147
4  USER_0000  ARTICLE_0561


In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from scipy.sparse.linalg import svds

# Load the data
article_info = pd.read_csv('./article_info.csv')
view_log = pd.read_csv('./view_log.csv')

# Combine title and content for TF-IDF vectorization.
# Missing titles/bodies are treated as empty strings: `NaN + str` stays NaN,
# and TfidfVectorizer rejects NaN documents.
article_info['text'] = article_info['Title'].fillna('') + " " + article_info['Content'].fillna('')

# Compute the TF-IDF matrix
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(article_info['text'])

# Compute cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Create a mapping of articleID to row label.
# Series.drop_duplicates() would de-duplicate the *values* (row labels, which
# are already unique), so repeated articleIDs are removed through the index
# instead; this keeps every lookup a scalar.
article_indices = pd.Series(article_info.index, index=article_info['articleID'])
article_indices = article_indices[~article_indices.index.duplicated(keep='first')]

# Function to get article recommendations based on content similarity
def get_content_recommendations(article_id, num_recommendations=5):
    """Return the IDs of the articles most similar in content to ``article_id``.

    Args:
        article_id: ID looked up in the module-level ``article_indices`` mapping.
        num_recommendations: Maximum number of article IDs to return.

    Returns:
        A list of article IDs ordered by decreasing ``cosine_sim`` score (ties
        keep row order). Empty when ``article_id`` is not in ``article_indices``.
    """
    idx = article_indices.get(article_id, None)
    if idx is None:
        return []
    if isinstance(idx, pd.Series):
        # A duplicated ID in the mapping yields a Series; use its first row.
        idx = idx.iloc[0]
    # Stable descending ranking: same tie order as sorting (row, score) pairs.
    ranked = np.argsort(-np.asarray(cosine_sim[idx]), kind='stable')
    # Remove the query article explicitly. It is not guaranteed to rank first
    # when another row has an identical (or all-zero) similarity to it.
    ranked = ranked[ranked != idx]
    top_rows = ranked[:max(num_recommendations, 0)]
    return article_info['articleID'].iloc[top_rows].values.tolist()

# Generate user-article interaction matrix
user_article_matrix = view_log.pivot_table(index='userID', columns='articleID', aggfunc='size', fill_value=0)

# Normalize the matrix by subtracting each user's mean view count (computed once, reused below)
user_means = user_article_matrix.mean(axis=1)
user_article_matrix_norm = user_article_matrix.subtract(user_means, axis=0)

# Perform SVD.
# The default svds solver needs k < min(matrix shape); cap the 50 requested
# factors so a small log does not abort the run. A plain float ndarray is
# passed because that is an input type svds documents.
n_factors = min(50, min(user_article_matrix_norm.shape) - 1)
U, sigma, Vt = svds(user_article_matrix_norm.to_numpy(dtype=np.float64), k=n_factors)
sigma = np.diag(sigma)

# Calculate predicted ratings: low-rank reconstruction plus the removed user means
predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_means.values.reshape(-1, 1)
predicted_ratings_df = pd.DataFrame(predicted_ratings, columns=user_article_matrix.columns, index=user_article_matrix.index)

# Function to get hybrid recommendations
def hybrid_recommendation(user_id, num_recommendations=5, weight_cf=0.7, weight_cb=0.3):
    """Recommend unseen articles by voting between SVD-based CF and content similarity.

    Args:
        user_id: A label present in ``predicted_ratings_df.index``.
        num_recommendations: Maximum number of article IDs to return.
        weight_cf: Score given to each collaborative-filtering candidate.
        weight_cb: Score given to each content-based candidate.

    Returns:
        A list of article IDs the user has not viewed, highest combined score
        first. Ties keep CF rank order, then content-candidate order.

    Raises:
        KeyError: If ``user_id`` is not in ``predicted_ratings_df.index``.
    """
    viewed_articles = view_log[view_log['userID'] == user_id]['articleID'].tolist()
    viewed = set(viewed_articles)

    # Collaborative filtering: drop viewed articles *before* truncating, so
    # they do not use up the 2 * num_recommendations candidate slots.
    ranked_by_cf = predicted_ratings_df.loc[user_id].sort_values(ascending=False).index
    cf_recommendations = [article for article in ranked_by_cf if article not in viewed]
    cf_recommendations = cf_recommendations[:num_recommendations * 2]

    # Content-based filtering: two nearest articles per viewed article.
    cb_recommendations = []
    for article in viewed_articles:
        cb_recommendations.extend(get_content_recommendations(article, num_recommendations=2))
    # Order-preserving de-duplication keeps tie-breaking reproducible.
    cb_recommendations = list(dict.fromkeys(cb_recommendations))

    # Combine and rank recommendations
    combined_scores = {}
    for article in cf_recommendations:
        combined_scores[article] = combined_scores.get(article, 0) + weight_cf
    for article in cb_recommendations:
        combined_scores[article] = combined_scores.get(article, 0) + weight_cb

    sorted_recommendations = sorted(combined_scores.items(), key=lambda item: item[1], reverse=True)
    unseen = [article for article, _ in sorted_recommendations if article not in viewed]
    return unseen[:num_recommendations]

# Collect one [userID, articleID] row per recommendation, for every user in the log
hybrid_recommendations = [
    [user, rec]
    for user in view_log['userID'].unique()
    for rec in hybrid_recommendation(user, num_recommendations=5)
]

# Tabulate the collected rows
hybrid_recommendations_df = pd.DataFrame(
    data=hybrid_recommendations,
    columns=['userID', 'articleID'],
)

# Write the table to disk without the row index
hybrid_recommendations_df.to_csv('./hybrid_recommendations_optimized.csv', index=False)

# Preview the first five rows
print(hybrid_recommendations_df.head(n=5))


      userID     articleID
0  USER_0000  ARTICLE_1568
1  USER_0000  ARTICLE_2720
2  USER_0000  ARTICLE_1948
3  USER_0000  ARTICLE_2147
4  USER_0000  ARTICLE_0561


In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from scipy.sparse.linalg import svds

# Load the data
article_info = pd.read_csv('./article_info.csv')
view_log = pd.read_csv('./view_log.csv')

# TF-IDF vectorization.
# Missing article bodies become empty strings; TfidfVectorizer rejects NaN documents.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(article_info['Content'].fillna(''))

# Map article IDs to row labels.
# Series.drop_duplicates() would de-duplicate the *values* (row labels, which
# are already unique), so repeated articleIDs are removed through the index
# instead; this keeps every lookup a scalar.
indices = pd.Series(article_info.index, index=article_info['articleID'])
indices = indices[~indices.index.duplicated(keep='first')]

# Compute cosine similarity (dot products of the TF-IDF rows)
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Content-based recommendations
def get_content_based_recommendations(article_id, num_recommendations=5):
    """Return the articles most similar in content to ``article_id``.

    Args:
        article_id: ID looked up in the module-level ``indices`` mapping.
        num_recommendations: Maximum number of articles to return.

    Returns:
        A pandas Series of article IDs (a slice of ``article_info['articleID']``)
        ordered by decreasing ``cosine_sim`` score, ties in row order. The
        Series is empty when ``article_id`` is not in ``indices``.
    """
    idx = indices.get(article_id, None)
    if idx is None:
        # Unknown article (e.g. viewed but absent from article_info): nothing
        # to recommend, rather than aborting the whole run with a KeyError.
        return article_info['articleID'].iloc[[]]
    if isinstance(idx, pd.Series):
        # A duplicated ID in the mapping yields a Series; use its first row.
        idx = idx.iloc[0]
    # Stable descending ranking: same tie order as sorting (row, score) pairs.
    ranked = np.argsort(-np.asarray(cosine_sim[idx]), kind='stable')
    # Remove the query article explicitly. It is not guaranteed to rank first
    # when another row has an identical (or all-zero) similarity to it.
    ranked = ranked[ranked != idx]
    top_rows = ranked[:max(num_recommendations, 0)]
    return article_info['articleID'].iloc[top_rows]

# Build the user-article interaction matrix
interaction_matrix = view_log.pivot_table(index='userID', columns='articleID', aggfunc='size', fill_value=0)

# Convert the interaction matrix to a floating-point dtype
interaction_matrix = interaction_matrix.astype(np.float64)

# Run SVD.
# The default svds solver needs k < min(matrix shape); cap the 50 requested
# factors so a small log does not abort the run. A plain ndarray is passed
# because that is an input type svds documents.
n_factors = min(50, min(interaction_matrix.shape) - 1)
U, sigma, Vt = svds(interaction_matrix.to_numpy(), k=n_factors)
sigma = np.diag(sigma)

# Compute the predicted ratings.
# The user index is attached so rows can be addressed by userID as well as by position.
predicted_ratings = np.dot(np.dot(U, sigma), Vt)
predicted_ratings = pd.DataFrame(predicted_ratings, index=interaction_matrix.index, columns=interaction_matrix.columns)

# Collaborative-filtering recommendations
def collaborative_filtering(user_id, num_recommendations=5):
    """Return the article IDs with the highest predicted rating for ``user_id``.

    The user's row position is taken from ``interaction_matrix.index`` and used
    to select the matching row of ``predicted_ratings``. The result is a pandas
    Index holding the first ``num_recommendations`` column labels, highest
    predicted rating first.
    """
    row_position = interaction_matrix.index.get_loc(user_id)
    user_predictions = predicted_ratings.iloc[row_position]
    ranked_predictions = user_predictions.sort_values(ascending=False)
    ranked_articles = ranked_predictions.index
    return ranked_articles[:num_recommendations]

# Hybrid recommendation system
def hybrid_recommendation_system(user_id, num_recommendations=5, weight_cf=0.7, weight_cb=0.3):
    """Recommend unseen articles by voting between CF and content-based candidates.

    Args:
        user_id: User whose rows in ``view_log`` define the viewed articles.
        num_recommendations: Maximum number of article IDs to return.
        weight_cf: Score given to each collaborative-filtering candidate.
        weight_cb: Score added each time an article is proposed by the
            content-based recommender (repeat proposals accumulate).

    Returns:
        A list of article IDs the user has not viewed, highest combined score
        first. Ties keep CF rank order, then content-candidate order.
    """
    viewed_articles = view_log[view_log['userID'] == user_id]['articleID'].tolist()
    viewed = set(viewed_articles)

    # Over-fetch by the number of viewed articles so that, once those are
    # dropped, up to 2 * num_recommendations unseen CF candidates remain.
    cf_ranked = collaborative_filtering(user_id, num_recommendations * 2 + len(viewed))
    cf_recommendations = [article for article in cf_ranked if article not in viewed]
    cf_recommendations = cf_recommendations[:num_recommendations * 2]

    # Two content neighbours per viewed article; duplicates are kept on
    # purpose so an article proposed several times collects several votes.
    cb_recommendations = []
    for article_id in viewed_articles:
        cb_recommendations.extend(get_content_based_recommendations(article_id, num_recommendations=2))

    combined_scores = {article: weight_cf for article in cf_recommendations}
    for article in cb_recommendations:
        combined_scores[article] = combined_scores.get(article, 0) + weight_cb

    sorted_recommendations = sorted(combined_scores.items(), key=lambda item: item[1], reverse=True)
    unseen = [article for article, _ in sorted_recommendations if article not in viewed]
    return unseen[:num_recommendations]

# Build the recommendation rows: one [userID, articleID] pair per recommended article
results = [
    [user_id, article_id]
    for user_id in view_log['userID'].unique()
    for article_id in hybrid_recommendation_system(user_id)
]

# Save the results
results_df = pd.DataFrame(data=results, columns=['userID', 'articleID'])
results_df.to_csv('./hybrid_recommendations_optimized.csv', index=False)

# Print the first five rows of the results
print(results_df.head(n=5))


      userID     articleID
0  USER_0000  ARTICLE_1305
1  USER_0000  ARTICLE_0084
2  USER_0000  ARTICLE_2081
3  USER_0000  ARTICLE_2449
4  USER_0000  ARTICLE_1568


In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from scipy.sparse.linalg import svds
from sklearn.preprocessing import MinMaxScaler

# Load data
article_info = pd.read_csv('./article_info.csv')
view_log = pd.read_csv('./view_log.csv')

# TF-IDF vectorization of the article contents
tfidf = TfidfVectorizer(stop_words='english', max_df=0.8, min_df=2, ngram_range=(1, 2))
tfidf_matrix = tfidf.fit_transform(article_info['Content'])

# Map each article ID to its row index in article_info.
# Series.drop_duplicates() compares the *values* (the row indices, which are
# already unique), so it never removed repeated article IDs. Deduplicate on
# the index labels instead, keeping the first row of each ID, so that
# indices[article_id] always yields a single position.
indices = pd.Series(article_info.index, index=article_info['articleID'])
indices = indices[~indices.index.duplicated(keep='first')]

# Pairwise article similarity (dot products of the TF-IDF rows)
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Content-based recommendation
def get_content_based_recommendations(article_id, num_recommendations=5):
    """Return the IDs of the articles most similar to ``article_id``.

    Args:
        article_id: Label looked up in ``indices`` to find the article's row
            in ``cosine_sim``.
        num_recommendations: Maximum number of similar articles to return.

    Returns:
        The selected entries of ``article_info['articleID']``, most similar
        first. The query article itself is never included.

    Raises:
        KeyError: If ``article_id`` is not a label in ``indices``.
    """
    idx = indices[article_id]
    sim_scores = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)
    # Exclude the query article by its row index instead of assuming it is the
    # first entry: when another article ties with it (e.g. identical text) or
    # its own similarity row is all zeros, slicing off position 0 would drop
    # a different article and return the query article as a recommendation.
    article_indices = [i for i, _ in sim_scores if i != idx][:num_recommendations]
    return article_info['articleID'].iloc[article_indices]

# Build the user-article interaction matrix: one row per userID, one column
# per articleID, each cell holding the number of view_log rows for that pair
# (0 when there are none). Cast to float64 before the decomposition.
interaction_matrix = view_log.pivot_table(
    index='userID',
    columns='articleID',
    aggfunc='size',
    fill_value=0,
).astype(np.float64)

# Truncated SVD (k is the tunable number of dimensions)
U, sigma, Vt = svds(interaction_matrix, k=100)
# svds returns the singular values as a 1-D array; expand them to a diagonal matrix
sigma = np.diag(sigma)

# Predicted ratings: multiply the factors back together and label the
# columns with the article IDs of the interaction matrix
predicted_ratings = pd.DataFrame(
    np.dot(np.dot(U, sigma), Vt),
    columns=interaction_matrix.columns,
)

# Collaborative-filtering recommendation
def collaborative_filtering(user_id, num_recommendations=5):
    """Return the article IDs with the highest predicted scores for a user.

    Articles the user has already viewed are not filtered out here.

    Args:
        user_id: Label to locate in ``interaction_matrix.index``.
        num_recommendations: How many top-scoring article IDs to keep.

    Returns:
        The first ``num_recommendations`` column labels of the user's row of
        ``predicted_ratings`` after sorting it in descending order.

    Raises:
        KeyError: If ``user_id`` is not present in ``interaction_matrix.index``.
    """
    # predicted_ratings has no user labels on its rows, so translate the user
    # ID into its row number in interaction_matrix and index by position.
    row_position = interaction_matrix.index.get_loc(user_id)
    user_scores = predicted_ratings.iloc[row_position]
    ranked_scores = user_scores.sort_values(ascending=False)
    return ranked_scores.index[:num_recommendations]

# Hybrid recommendation system
def hybrid_recommendation_system(user_id, num_recommendations=5, weight_cf=0.7, weight_cb=0.3):
    """Blend collaborative-filtering and content-based candidates for one user.

    Every collaborative-filtering candidate starts with a score of
    ``weight_cf``. Each time an article is returned as a content-based
    neighbour of something the user viewed, ``weight_cb`` is added to its
    score, so an article suggested by several viewed articles accumulates
    weight.

    Args:
        user_id: Value matched against the ``userID`` column of ``view_log``.
        num_recommendations: Maximum number of article IDs to return.
        weight_cf: Score given to each collaborative-filtering candidate.
        weight_cb: Score added per content-based occurrence of an article.

    Returns:
        A list of at most ``num_recommendations`` article IDs, highest score
        first, excluding articles the user has already viewed.
    """
    # Twice the requested number of CF candidates is fetched - presumably to
    # leave headroom for the already-viewed filter applied at the end.
    cf_recommendations = collaborative_filtering(user_id, num_recommendations * 2)
    viewed_articles = view_log[view_log['userID'] == user_id]['articleID'].tolist()

    # Two content-based neighbours per viewed article; repeats are kept on
    # purpose because every occurrence contributes weight_cb below.
    cb_recommendations = []
    for article_id in viewed_articles:
        cb_recommendations.extend(get_content_based_recommendations(article_id, num_recommendations=2))

    combined_scores = {article: weight_cf for article in cf_recommendations}
    for article in cb_recommendations:
        if article in combined_scores:
            combined_scores[article] += weight_cb
        else:
            combined_scores[article] = weight_cb

    # sorted() is stable, so equally scored articles keep their insertion
    # order (CF candidates in rank order, then content-based ones).
    ranked = sorted(combined_scores.items(), key=lambda item: item[1], reverse=True)

    # Set lookup keeps the final filter O(1) per candidate.
    already_viewed = set(viewed_articles)
    return [article for article, _ in ranked if article not in already_viewed][:num_recommendations]

# Generate recommendations for every distinct user in the view log
results = []
for user_id in view_log['userID'].unique():
    recommendations = hybrid_recommendation_system(user_id)
    # One [userID, articleID] row per recommended article.
    results.extend([user_id, article_id] for article_id in recommendations)

# Save the results as a two-column CSV without the DataFrame index
results_df = pd.DataFrame(results, columns=['userID', 'articleID'])
results_df.to_csv('./hybrid_recommendations_optimized.csv', index=False)

# Print the first rows of the result
print(results_df.head())


      userID     articleID
0  USER_0000  ARTICLE_1305
1  USER_0000  ARTICLE_0084
2  USER_0000  ARTICLE_2081
3  USER_0000  ARTICLE_0830
4  USER_0000  ARTICLE_2147


In [None]:
import pandas as pd

# Load data
article_info = pd.read_csv('./article_info.csv')
view_log = pd.read_csv('./view_log.csv')

# Inspect both tables.
# DataFrame.info() prints its report itself and returns None, so each
# wrapping print() also emits a literal "None" line after the report.
print(article_info.info())
print(view_log.info())

# Check the length of each article's content and summarise the distribution
article_info['content_length'] = article_info['Content'].map(len)
print(article_info['content_length'].describe())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3008 entries, 0 to 3007
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   articleID    3008 non-null   object
 1   Title        3008 non-null   object
 2   Content      3008 non-null   object
 3   Format       3008 non-null   object
 4   Language     3008 non-null   object
 5   userID       3008 non-null   object
 6   userCountry  659 non-null    object
 7   userRegion   657 non-null    object
dtypes: object(8)
memory usage: 188.1+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42717 entries, 0 to 42716
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   userID       42717 non-null  object
 1   articleID    42717 non-null  object
 2   userRegion   42717 non-null  object
 3   userCountry  42717 non-null  object
dtypes: object(4)
memory usage: 1.3+ MB
None
count      3008.000000
mean       5428

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from scipy.sparse.linalg import svds
import numpy as np
import pandas as pd

# Load data
article_info = pd.read_csv('./article_info.csv')
view_log = pd.read_csv('./view_log.csv')

# TF-IDF vectorization of the article contents
tfidf = TfidfVectorizer(stop_words='english', max_df=0.7, min_df=3, ngram_range=(1, 3))
tfidf_matrix = tfidf.fit_transform(article_info['Content'])

# Map each article ID to its row index in article_info.
# Series.drop_duplicates() compares the *values* (the row indices, which are
# already unique), so it never removed repeated article IDs. Deduplicate on
# the index labels instead, keeping the first row of each ID, so that
# indices[article_id] always yields a single position.
indices = pd.Series(article_info.index, index=article_info['articleID'])
indices = indices[~indices.index.duplicated(keep='first')]

# Pairwise article similarity (dot products of the TF-IDF rows)
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Content-based recommendation
def get_content_based_recommendations(article_id, num_recommendations=5):
    """Return the IDs of the articles most similar to ``article_id``.

    Args:
        article_id: Label looked up in ``indices`` to find the article's row
            in ``cosine_sim``.
        num_recommendations: Maximum number of similar articles to return.

    Returns:
        The selected entries of ``article_info['articleID']``, most similar
        first. The query article itself is never included.

    Raises:
        KeyError: If ``article_id`` is not a label in ``indices``.
    """
    idx = indices[article_id]
    sim_scores = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)
    # Exclude the query article by its row index instead of assuming it is the
    # first entry: when another article ties with it (e.g. identical text) or
    # its own similarity row is all zeros, slicing off position 0 would drop
    # a different article and return the query article as a recommendation.
    article_indices = [i for i, _ in sim_scores if i != idx][:num_recommendations]
    return article_info['articleID'].iloc[article_indices]

# Build the user-article interaction matrix: one row per userID, one column
# per articleID, each cell holding the number of view_log rows for that pair
# (0 when there are none). Cast to float64 before the decomposition.
interaction_matrix = view_log.pivot_table(
    index='userID',
    columns='articleID',
    aggfunc='size',
    fill_value=0,
).astype(np.float64)

# Truncated SVD (k is the tunable number of dimensions)
U, sigma, Vt = svds(interaction_matrix, k=75)
# svds returns the singular values as a 1-D array; expand them to a diagonal matrix
sigma = np.diag(sigma)

# Predicted ratings: multiply the factors back together and label the
# columns with the article IDs of the interaction matrix
predicted_ratings = pd.DataFrame(
    np.dot(np.dot(U, sigma), Vt),
    columns=interaction_matrix.columns,
)

# Collaborative-filtering recommendation
def collaborative_filtering(user_id, num_recommendations=5):
    """Return the article IDs with the highest predicted scores for a user.

    Articles the user has already viewed are not filtered out here.

    Args:
        user_id: Label to locate in ``interaction_matrix.index``.
        num_recommendations: How many top-scoring article IDs to keep.

    Returns:
        The first ``num_recommendations`` column labels of the user's row of
        ``predicted_ratings`` after sorting it in descending order.

    Raises:
        KeyError: If ``user_id`` is not present in ``interaction_matrix.index``.
    """
    # predicted_ratings has no user labels on its rows, so translate the user
    # ID into its row number in interaction_matrix and index by position.
    row_position = interaction_matrix.index.get_loc(user_id)
    user_scores = predicted_ratings.iloc[row_position]
    ranked_scores = user_scores.sort_values(ascending=False)
    return ranked_scores.index[:num_recommendations]

# Hybrid recommendation system
def hybrid_recommendation_system(user_id, num_recommendations=5, weight_cf=0.8, weight_cb=0.2):
    """Blend collaborative-filtering and content-based candidates for one user.

    Every collaborative-filtering candidate starts with a score of
    ``weight_cf``. Each time an article is returned as a content-based
    neighbour of something the user viewed, ``weight_cb`` is added to its
    score, so an article suggested by several viewed articles accumulates
    weight.

    Args:
        user_id: Value matched against the ``userID`` column of ``view_log``.
        num_recommendations: Maximum number of article IDs to return.
        weight_cf: Score given to each collaborative-filtering candidate.
        weight_cb: Score added per content-based occurrence of an article.

    Returns:
        A list of at most ``num_recommendations`` article IDs, highest score
        first, excluding articles the user has already viewed.
    """
    # Twice the requested number of CF candidates is fetched - presumably to
    # leave headroom for the already-viewed filter applied at the end.
    cf_recommendations = collaborative_filtering(user_id, num_recommendations * 2)
    viewed_articles = view_log[view_log['userID'] == user_id]['articleID'].tolist()

    # Two content-based neighbours per viewed article; repeats are kept on
    # purpose because every occurrence contributes weight_cb below.
    cb_recommendations = []
    for article_id in viewed_articles:
        cb_recommendations.extend(get_content_based_recommendations(article_id, num_recommendations=2))

    combined_scores = {article: weight_cf for article in cf_recommendations}
    for article in cb_recommendations:
        if article in combined_scores:
            combined_scores[article] += weight_cb
        else:
            combined_scores[article] = weight_cb

    # sorted() is stable, so equally scored articles keep their insertion
    # order (CF candidates in rank order, then content-based ones).
    ranked = sorted(combined_scores.items(), key=lambda item: item[1], reverse=True)

    # Set lookup keeps the final filter O(1) per candidate.
    already_viewed = set(viewed_articles)
    return [article for article, _ in ranked if article not in already_viewed][:num_recommendations]

# Generate recommendations for every distinct user in the view log
results = []
for user_id in view_log['userID'].unique():
    recommendations = hybrid_recommendation_system(user_id)
    # One [userID, articleID] row per recommended article.
    results.extend([user_id, article_id] for article_id in recommendations)

# Save the results as a two-column CSV without the DataFrame index
results_df = pd.DataFrame(results, columns=['userID', 'articleID'])
results_df.to_csv('./hybrid_recommendations_optimized.csv', index=False)

# Print the first rows of the result
print(results_df.head())


      userID     articleID
0  USER_0000  ARTICLE_1305
1  USER_0000  ARTICLE_2147
2  USER_0000  ARTICLE_1948
3  USER_0000  ARTICLE_1281
4  USER_0000  ARTICLE_2720
