### Recommender System
Let's clean up our tweet_topic_matrix_df, take a look at the topic vector for each outlet, and create our recommender system.

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


In [3]:
# Load the outlet-by-topic DataFrame from a local pickle file.
# NOTE: unpickling can execute arbitrary code -- only load pickles from a trusted source.
outlet_df = pd.read_pickle('outlet_df.pickle')

In [4]:
# Inspect the loaded frame: one row per outlet, one column per topic coefficient.
outlet_df

Unnamed: 0,outlet,Trump,Pandemic_Impact,Business,Biden,Covid_Spread,China,US_Election,Work_From_Home,Stock_Market,...,Climate_Change,(Un)employeement,Technology,Hong_Kong,Self-Help,Covid_Vaccine,Personal_Finance,Parenting,Cars,Global_Lockdown
0,FinancialTimes,0.002097,0.00185,0.001701,0.001309,0.002443,0.004224,0.002391,0.000939,0.002885,...,0.001495,0.001479,0.002795,0.00311,0.001315,0.001446,0.001667,0.001297,0.001113,0.005063
1,Medium,0.000576,0.000609,0.000702,0.000304,0.000704,0.000459,0.000545,0.000492,0.000458,...,0.00128,0.000906,0.002952,0.000269,0.00178,0.000686,0.000923,0.002043,0.000786,0.001131
2,NewYorker,0.002584,0.000958,0.000254,0.001412,0.000647,0.000527,0.001215,0.000481,0.000298,...,0.001507,0.000523,0.001379,0.000493,0.001761,0.000581,0.000691,0.002175,0.000713,0.001488
3,TheAtlantic,0.001784,0.001,0.0003,0.001116,0.000762,0.000698,0.001106,0.000379,0.000281,...,0.001131,0.000543,0.001297,0.00069,0.001171,0.000489,0.000627,0.001904,0.000498,0.001406
4,TheEconomist,0.001077,0.001623,0.000568,0.001123,0.000796,0.002706,0.001767,0.000443,0.000914,...,0.001379,0.000729,0.001454,0.001177,0.000963,0.00093,0.001504,0.001232,0.000676,0.002167
5,WIRED,0.000638,0.000751,0.001426,0.000334,0.00095,0.000857,0.000835,0.000892,0.000662,...,0.002079,0.000769,0.007508,0.00056,0.001588,0.000978,0.001196,0.001739,0.002421,0.001907
6,WSJ,0.002055,0.003996,0.002182,0.002279,0.002427,0.002813,0.001742,0.001551,0.003374,...,0.001057,0.001865,0.003329,0.001776,0.001519,0.001769,0.001668,0.002131,0.001381,0.004342
7,businessinsider,0.001241,0.002518,0.002545,0.000997,0.000951,0.000993,0.001445,0.001326,0.004581,...,0.000858,0.002731,0.006354,0.000534,0.001667,0.001183,0.003529,0.001331,0.004414,0.001854
8,nytimes,0.00334,0.003165,0.000935,0.003486,0.00379,0.001922,0.002553,0.001423,0.000989,...,0.001909,0.001639,0.002481,0.001551,0.001541,0.001665,0.001366,0.003393,0.001265,0.005753
9,washingtonpost,0.004698,0.002791,0.00047,0.00311,0.002152,0.001385,0.002808,0.000885,0.000628,...,0.001283,0.001097,0.002068,0.001056,0.001048,0.001528,0.001055,0.002246,0.000695,0.003429


In [5]:
# Replace each raw outlet identifier with a human-readable display name.
# An explicit mapping (instead of a positional list) keeps every label attached
# to the right row even if the row order changes, and values that are not in
# the mapping (e.g. names already converted on a previous run) are left as-is.
outlet_display_names = {
    'FinancialTimes': 'The Financial Times',
    'Medium': 'Medium',
    'NewYorker': 'The New Yorker',
    'TheAtlantic': 'The Atlantic',
    'TheEconomist': 'The Economist',
    'WIRED': 'Wired',
    'WSJ': 'Wall Street Journal',
    'businessinsider': 'Business Insider',
    'nytimes': 'New York Times',
    'washingtonpost': 'The Washington Post',
}
outlet_df['outlet'] = outlet_df['outlet'].replace(outlet_display_names)

In [6]:
# Re-display the frame to confirm the 'outlet' column now holds the display names.
outlet_df

Unnamed: 0,outlet,Trump,Pandemic_Impact,Business,Biden,Covid_Spread,China,US_Election,Work_From_Home,Stock_Market,...,Climate_Change,(Un)employeement,Technology,Hong_Kong,Self-Help,Covid_Vaccine,Personal_Finance,Parenting,Cars,Global_Lockdown
0,The Financial Times,0.002097,0.00185,0.001701,0.001309,0.002443,0.004224,0.002391,0.000939,0.002885,...,0.001495,0.001479,0.002795,0.00311,0.001315,0.001446,0.001667,0.001297,0.001113,0.005063
1,Medium,0.000576,0.000609,0.000702,0.000304,0.000704,0.000459,0.000545,0.000492,0.000458,...,0.00128,0.000906,0.002952,0.000269,0.00178,0.000686,0.000923,0.002043,0.000786,0.001131
2,The New Yorker,0.002584,0.000958,0.000254,0.001412,0.000647,0.000527,0.001215,0.000481,0.000298,...,0.001507,0.000523,0.001379,0.000493,0.001761,0.000581,0.000691,0.002175,0.000713,0.001488
3,The Atlantic,0.001784,0.001,0.0003,0.001116,0.000762,0.000698,0.001106,0.000379,0.000281,...,0.001131,0.000543,0.001297,0.00069,0.001171,0.000489,0.000627,0.001904,0.000498,0.001406
4,The Economist,0.001077,0.001623,0.000568,0.001123,0.000796,0.002706,0.001767,0.000443,0.000914,...,0.001379,0.000729,0.001454,0.001177,0.000963,0.00093,0.001504,0.001232,0.000676,0.002167
5,Wired,0.000638,0.000751,0.001426,0.000334,0.00095,0.000857,0.000835,0.000892,0.000662,...,0.002079,0.000769,0.007508,0.00056,0.001588,0.000978,0.001196,0.001739,0.002421,0.001907
6,Wall Street Journal,0.002055,0.003996,0.002182,0.002279,0.002427,0.002813,0.001742,0.001551,0.003374,...,0.001057,0.001865,0.003329,0.001776,0.001519,0.001769,0.001668,0.002131,0.001381,0.004342
7,Business Insider,0.001241,0.002518,0.002545,0.000997,0.000951,0.000993,0.001445,0.001326,0.004581,...,0.000858,0.002731,0.006354,0.000534,0.001667,0.001183,0.003529,0.001331,0.004414,0.001854
8,New York Times,0.00334,0.003165,0.000935,0.003486,0.00379,0.001922,0.002553,0.001423,0.000989,...,0.001909,0.001639,0.002481,0.001551,0.001541,0.001665,0.001366,0.003393,0.001265,0.005753
9,The Washington Post,0.004698,0.002791,0.00047,0.00311,0.002152,0.001385,0.002808,0.000885,0.000628,...,0.001283,0.001097,0.002068,0.001056,0.001048,0.001528,0.001055,0.002246,0.000695,0.003429


In [7]:
# Persist the renamed frame.
# NOTE(review): this writes to the same 'outlet_df.pickle' path that was loaded
# earlier in this notebook, so the original (un-renamed) file is overwritten.
outlet_df.to_pickle('outlet_df.pickle')

In [8]:
# Save a CSV copy as well. A relative path (like the pickle above) keeps the
# notebook runnable on machines other than the author's; run the notebook from
# the project directory so the file lands alongside the pickle.
outlet_df.to_csv('outlet_df.csv')

In [31]:
def News_Outlet_Recommender(outlet_df, Covid = 1, Staying_at_Home = 1, US_Politics = 1, Global_Politics = 1, Global_Economy = 1, Social_Issues = 1, Business = 1, Personal_Development =1, Hobbies=1):
    """
    Recommend the news outlet whose topic profile best matches the user's theme weights.

    The mean topic vector across all outlets (the "average tweet") is scaled
    topic-by-topic with the user's weights to form a user vector; the outlet
    whose topic vector has the highest cosine similarity to it is returned.

    Parameters
    ----------
    outlet_df : pandas.DataFrame
        First column holds the outlet label; every remaining column is a
        numeric topic coefficient (one row per outlet).
    Covid, Staying_at_Home, US_Politics, Global_Politics, Global_Economy,
    Social_Issues, Business, Personal_Development, Hobbies : number, default 1
        Weight for each theme. A theme's weight is split evenly across its
        topics, i.e. each topic is multiplied by ``weight / len(theme topics)``.
        A weight of 0 removes the theme's topics from the user vector.
        Topic columns that belong to no theme keep a multiplier of 1.

    Returns
    -------
    The value in the first column of the row with the highest cosine
    similarity to the user vector (the first such row on ties).
    """
    # Pair each user-supplied theme weight with the topic columns it governs.
    themes = [
        (Covid, ['Pandemic_Impact', 'Covid_Spread', 'Covid_Vaccine', 'Global_Lockdown']),
        (Staying_at_Home, ['Work_From_Home', 'Education', 'Parenting']),
        (US_Politics, ['Trump', 'Biden', 'US_Election', 'US_Supreme_Court']),
        (Global_Politics, ['China', 'Brexit_EU', 'Hong_Kong']),
        (Global_Economy, ['Economy', '(Un)employeement']),
        (Social_Issues, ['Police_Protests', 'Gender_Equality', 'Climate_Change']),
        (Business, ['Business', 'Stock_Market', 'Technology']),
        (Personal_Development, ['Health_Wellness', 'Self-Help', 'Personal_Finance']),
        (Hobbies, ['Cars']),
    ]

    # Spread every theme weight evenly over the topics in that theme.
    topic_weights = {
        topic: theme_weight / len(topics)
        for theme_weight, topics in themes
        for topic in topics
    }

    # Use the same column selection for the average and for the comparison so
    # the user vector is always aligned with the outlet vectors.
    topic_df = outlet_df.iloc[:, 1:]
    average_tweet = topic_df.mean()

    # Per-column multiplier; columns outside every theme are left unscaled.
    weights = np.array(
        [topic_weights.get(str(column), 1) for column in topic_df.columns],
        dtype=float,
    )

    # Scale the average tweet into the user's preference vector (shape 1 x n_topics).
    user_vector = (average_tweet.to_numpy(dtype=float) * weights).reshape(1, -1)

    # Cosine similarity between each outlet row and the user vector (shape n_outlets x 1).
    similarities = cosine_similarity(topic_df, user_vector)

    # Row position of the most similar outlet; its label is in the first column.
    best_row = int(np.argmax(similarities))
    return outlet_df.iloc[best_row, 0]



In [32]:
# Example query: no interest in Covid, politics or hobbies; extra emphasis on
# social issues and personal development.
News_Outlet_Recommender(
    outlet_df,
    Covid=0,
    Staying_at_Home=1,
    US_Politics=0,
    Global_Politics=0,
    Global_Economy=1,
    Social_Issues=2,
    Business=1,
    Personal_Development=2,
    Hobbies=0,
)

'Medium'

In [28]:
# NOTE(review): removes the recommender function from the namespace; any cell
# run after this one that calls it will raise NameError. This looks like
# leftover scratch work -- consider deleting this cell.
del News_Outlet_Recommender