Based on the following notebook:
https://www.kaggle.com/code/eunicemok/recommendation-engine-that-combats-polarization

In the notebook two datasets were used:
1. CI&T, which includes, for each user, the articles they interacted with and the type of interaction - 'BOOKMARK', 'COMMENT CREATED', 'FOLLOW', 'LIKE', 'VIEW'
2. All the News, which includes information about different news articles and their source.

In the notebook, the source of each article (the website it was published on) was used to determine the political view of the article (left, neutral, right). Then a model was trained on TF-IDF features of the article content in order to learn to classify the view of each article.
Using this model, we can classify the views of all the articles a user interacted with, and in that way see what their general views are.

In [None]:
import pandas as pd
from zipfile import ZipFile
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

### Load Data:

In [None]:
# Location of the CI&T zip archive; update this to the relevant local path.
path = 'C:/Users/gony/Downloads/CI&T.zip'

In [None]:
# Read every member of the archive at `path` as a UTF-8 CSV and keep the
# resulting frames in a dict keyed by the member's name inside the zip.
dfs = {}
with ZipFile(path, "r") as archive:
    for member in archive.namelist():
        with archive.open(member) as handle:
            print(member)
            dfs[member] = pd.read_csv(handle, encoding='utf8')

# The two tables the rest of the notebook works with.
users_intrcs = dfs['users_interactions.csv']
shared_articles = dfs['shared_articles.csv']

In [None]:
# Location of the "All the News" zip archive; update this to the relevant local path.
path = 'C:/Users/gony/Downloads/All_the_News.zip'

In [None]:
# Read every CSV in the archive at `path` and stack them into one `articles`
# frame, which is the name the analysis cells use.
# NOTE(review): a previously commented-out variant skipped 'articles2.csv';
# all members are loaded here, matching what the active code did.
frames = []
with ZipFile(path, "r") as f:
    for name in f.namelist():
        with f.open(name) as zd:
            print(name)
            frames.append(pd.read_csv(zd, index_col=False))

# Concatenate once instead of growing a frame inside the loop (which re-copies
# all accumulated rows on every iteration). ignore_index=True gives each row a
# unique 0..n-1 label; without it the per-file labels repeat, and label-based
# lookups such as Y[ind] / articles.at[ind, ...] become ambiguous.
articles = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
del frames

# Keep `dfs` bound to the combined frame, as the original cell did.
dfs = articles

In [None]:
# update relevant path
path = 'C:/Users/gony/Downloads/MINDsmall_train.zip'

# Column names for the header-less TSV files, keyed by archive member name.
# NOTE(review): "Abstract Entities " carries a trailing space and 'impressions'
# is lower-case, unlike the other names; both are kept unchanged so existing
# column lookups keep working — confirm whether that is intended.
mind_columns = {
    'news.tsv': ['News ID',
                 "Category",
                 "SubCategory",
                 "Title",
                 "Abstract",
                 "URL",
                 "Title Entities",
                 "Abstract Entities "],
    'behaviors.tsv': ['Impression ID',
                      'User ID',
                      'Time',
                      'History',
                      'impressions'],
}

dfs = {}
with ZipFile(path, "r") as f:
    for name in f.namelist():
        # Only members with a known schema are read. Any other member
        # (including an unexpected .tsv) is skipped rather than parsed with
        # an undefined or left-over column list.
        if name not in mind_columns:
            continue
        with f.open(name) as zd:
            print(name)
            dfs[name] = pd.read_csv(zd, encoding='utf8', sep='\t',
                                    header=None, names=mind_columns[name])
behaviors = dfs["behaviors.tsv"]
news = dfs['news.tsv']

In [None]:
# Preview the first rows of the news table loaded from 'news.tsv'.
news.head()

In [None]:
# Show the 'Abstract' value of the first news row.
news['Abstract'].values[0]

In [None]:
# Preview the first rows of the user-interaction table.
users_intrcs.head()

In [None]:
# Column names, dtypes and non-null counts of the user-interaction table.
users_intrcs.info()

In [None]:
# Distinct values of the interaction 'eventType' column.
np.unique(users_intrcs['eventType'].values)

In [None]:
# Preview the first rows of the shared-articles table.
shared_articles.head()

In [None]:
# Column names, dtypes and non-null counts of the shared-articles table.
shared_articles.info()

In [None]:
# Distinct values of the shared-articles 'eventType' column.
np.unique(shared_articles['eventType'].values)

In [None]:
# Preview the first rows of the `articles` frame.
articles.head()

In [None]:
# Column names, dtypes and non-null counts of the `articles` frame.
articles.info()

## Analysis:

In [None]:
# Give the 'Unnamed: 0' column a readable name
# (presumably the CSV's unnamed leading column — TODO confirm).
articles = articles.rename({'Unnamed: 0': 'number'}, axis='columns')


In [None]:
# Publisher of each article. A single column lookup is enough; looping over
# the frame's columns only re-assigned the same Series on every pass.
Y = articles['publication']
Y.head()

In [None]:
# Distinct publisher names present in the data.
np.unique(Y.values)

Categorizing each article based on its publisher:

In [None]:
# Label every article with a political lean derived from its publisher.
# Publishers listed below map to 'Left' or 'Neutral'; any other value
# (including a missing publisher) is labelled 'Right'.
# NOTE(review): the names must match the values in articles['publication']
# exactly — verify them against np.unique(Y.values).
left_publishers = [
    'New York Times',
    'CNN',
    'Buzzfeed News',
    'Business Insider',
    'the Atlantic',
    'Talking Points Memo',
    'the Guardian',
    'Vox',
    'Washington Post',
]
neutral_publishers = ['NPR', 'Reuters']

lean_by_publisher = {
    **dict.fromkeys(left_publishers, 'Left'),
    **dict.fromkeys(neutral_publishers, 'Neutral'),
}

# Vectorised over the whole column so that every row gets a label; a
# row-by-row loop over range(len(Y)-1) stops one short and leaves the last
# article without a 'Lean' value.
articles['Lean'] = articles['publication'].map(lean_by_publisher).fillna('Right')

In [None]:
# Per-lean counts expressed as a percentage of all articles. count() tallies
# non-null cells per column, so the percentages are reported column by column.
target = articles.groupby("Lean").count()
percent_target = target.div(len(articles)).mul(100)
percent_target

In [None]:
# Bar chart of the number of articles per lean.
articles.groupby(["Lean"]).Lean.count().plot.bar(ylim=0)
# Call show(); a bare `plt.show` only references the function and renders nothing.
plt.show()

Training a model to classify the lean:

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import sklearn
from sklearn.multiclass import OneVsRestClassifier   #1vs1 & 1vsRest Classifiers
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import gc

In [None]:
# Take a reproducible random sample of articles (at most 10,000 rows).
# Capping at len(articles) keeps the cell working on smaller datasets, where
# sampling 10,000 rows without replacement would raise.
sample_size = min(10000, len(articles))
sample = articles.sample(n=sample_size, random_state=1)
y = sample['Lean']      # target: lean label
X = sample['content']   # feature source: article text

In [None]:
# Hold out 10% of the sampled articles for validation *before* fitting the
# vectoriser, so that the vocabulary and IDF weights are learned from the
# training texts only and the validation score is not inflated by leakage.
X_train_text, X_val_text, y_train, y_val = train_test_split(
    X, y, test_size=0.10, random_state=1, shuffle=True)

# TF-IDF features: English stop words removed, and terms that occur in more
# than 70% of the training documents ignored (max_df=0.7).
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
X_train = vectorizer.fit_transform(X_train_text.values)
X_val = vectorizer.transform(X_val_text.values)
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

# Fit the lean classifier and predict the held-out articles.
model = PassiveAggressiveClassifier(max_iter=10000, random_state=1, tol=1e-3).fit(X_train, y_train)
y_pred = model.predict(X_val)

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Fix the class order explicitly so that the matrix rows/columns always line
# up with the displayed tick labels, even if a class is missing from both
# y_val and y_pred.
lean_labels = ['Left', 'Neutral', 'Right']
cm = confusion_matrix(y_val, y_pred, labels=lean_labels)
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=lean_labels)

cm_display.plot()
plt.show()

In [None]:
# Fraction of validation articles whose predicted lean matches the label,
# computed with the accuracy_score metric the notebook already imports.
print('accuracy of model: ', accuracy_score(y_val, y_pred))

In [None]:
# Predict a lean for every shared article from its text, reusing the fitted
# vectoriser (transform only, no re-fit) and the trained model.
x_test = shared_articles['text']
shared_tfidf = vectorizer.transform(x_test.values)
y_test = model.predict(shared_tfidf)

In [None]:
# Store the predicted lean next to each shared article.
shared_articles['Lean'] = y_test

In [None]:
# Bar chart of the number of shared articles per predicted lean.
shared_articles.groupby(["Lean"]).Lean.count().plot.bar(ylim=0)
# Call show(); a bare `plt.show` only references the function and renders nothing.
plt.show()

In [None]:
# Work on a copy so the loaded interaction table itself is left untouched.
df = users_intrcs.copy()

Can someone check my merge logic? (always confuses me)

In [None]:
# Attach to each interaction the article it refers to, matching on contentId.
# Inner join: interactions whose contentId has no row in shared_articles are
# dropped. Both tables have an 'eventType' column, so the result carries
# 'eventType_x' (from the interactions) and 'eventType_y' (from the articles).
# NOTE(review): if a contentId appears more than once in shared_articles, each
# matching interaction is repeated once per article row — confirm contentId is
# unique there.
df = df.merge(shared_articles, on='contentId', how='inner')

In [None]:
# One row per user: the leans of all articles they interacted with and the
# matching interaction types, each collected into a list.
groupby_user = (
    df.groupby('personId', as_index=False)
      .agg({'Lean': list, 'eventType_x': list})
)

In [None]:
# Preview the per-user lists of leans and interaction types.
groupby_user.head()

In [None]:
# Per-user list of article leans, used to compute the lean fractions.
temp = groupby_user['Lean']


In [None]:
# For every user, store the fraction of their interactions that fall on
# Left, Neutral and Right articles. Anything that is neither 'Left' nor
# 'Right' is counted as neutral.
for row_idx, user_leans in enumerate(temp):
    total = len(user_leans)
    n_left = user_leans.count('Left')
    n_right = user_leans.count('Right')
    n_neutral = total - n_left - n_right
    groupby_user.at[row_idx, 'Left'] = n_left / total
    groupby_user.at[row_idx, 'Neutral'] = n_neutral / total
    groupby_user.at[row_idx, 'Right'] = n_right / total


In [None]:
# Preview the per-user table, now including the Left/Neutral/Right fractions.
groupby_user.head()

In [None]:
# Histogram of the article leans for the first user in the table.
first_user_leans = groupby_user.iloc[0]['Lean']
plt.hist(first_user_leans)
plt.show()

In [None]:
import seaborn as sns
# Reproducible random sample of 30 users (raises if there are fewer than 30 rows).
lean_sample = groupby_user.sample(n= 30, random_state = 1)


In [None]:
# Heatmap of the lean fractions for the sampled users, one row per user,
# with each cell annotated by its value.
heatmap_columns = ['Right', 'Neutral', 'Left']
plt.figure(figsize=(10, 6))
sns.heatmap(lean_sample[heatmap_columns], annot=True)
plt.show()

In [None]:
# Split users into three buckets and keep one fraction per user:
#   left    - Left fraction, for users with at least 60% Left articles
#   right   - Right fraction, for the remaining users with at least 60% Right
#   neutral - Neutral fraction, for everyone else (i.e. all users without a
#             60% Left or Right majority, not only mostly-neutral readers)
left_share = groupby_user['Left']
right_share = groupby_user['Right']
is_left = left_share >= 0.6
is_right = ~is_left & (right_share >= 0.6)

left = left_share[is_left].tolist()
right = right_share[is_right].tolist()
neutral = groupby_user['Neutral'][~(is_left | is_right)].tolist()

In [None]:
# Number of users in each bucket (left, right, neutral).
len(left), len(right), len(neutral)

In [None]:
# One histogram per bucket, each drawn and shown as its own figure.
for bucket_values, bucket_title in (
    (left, 'Histogram of users who lean to Left articles'),
    (right, 'Histogram of users who lean to Right articles'),
    (neutral, 'Histogram of users who lean to Neutral articles'),
):
    plt.hist(bucket_values)
    plt.title(bucket_title)
    plt.show()