In [1]:
import pandas as pd

In [22]:
!wget https://www.dropbox.com/s/hxs7bm2pki5nez0/tech-reviews.csv -o tech-reviews.csv

In [2]:
# Load the reviews dataset, asking pandas to parse the `date` column while reading.
# NOTE(review): the `df.dtypes` output recorded in this notebook still reports `date`
# as object, so the parse apparently did not take effect — confirm and convert explicitly.
reviews_path = "../data/tech-reviews.csv"
df = pd.read_csv(reviews_path, parse_dates=["date"])

In [3]:
# Preview the first five rows of the reviews frame.
df.head(n=5)

Unnamed: 0,_id,score,publication,category,article,author,date,title
0,https://www.techradar.com/reviews/razer-opus,4.5,TechRadar,Audio Visual,The Razer Opus is Razer's first foray into a n...,Bill Thomas,2020-05-19 00:00:00,Razer Opus review
1,https://www.techradar.com/reviews/huawei-freel...,4.5,TechRadar,Audio Visual,The Huawei FreeLace Pro offer a surprisingly i...,Lee Bell,2020-10-02 00:00:00,Huawei FreeLace Pro review
2,https://www.techradar.com/reviews/hp-reverb-g2,4.5,TechRadar,Computing,The HP Reverb G2 is easily one of the best VR ...,Matt Hanson,2020-12-21 00:00:00,HP Reverb G2 review
3,https://www.techradar.com/reviews/sennheiser-a...,4.5,TechRadar,Televisions,"It’s seriously expensive, but immersive Dolby ...",Cliff Joseph,2019-07-05 00:00:00,Sennheiser Ambeo 3D Soundbar review
4,https://www.techradar.com/reviews/arlo-video-d...,4.5,TechRadar,Smart Home,The Arlo Video Doorbell may be cheaper than th...,Christian de Looper,2020-12-30 00:00:00,Arlo Video Doorbell review


In [4]:
# Show the dtype pandas assigned to each column.
df.dtypes

_id             object
score          float64
publication     object
category        object
article         object
author          object
date            object
title           object
dtype: object

In [5]:
# Quick structural summary: column labels, frame shape, and per-column cardinality.
frame_shape = df.shape
unique_counts = df.nunique()

print(df.columns)
print("\n")
print(f'dataframe shape: {frame_shape} \n')
print(f'Number of unique values in each cloumn: \n {unique_counts}\n')


Index(['_id', 'score', 'publication', 'category', 'article', 'author', 'date',
       'title'],
      dtype='object')


dataframe shape: (28092, 8) 

Number of unique values in each cloumn: 
 _id            28092
score             63
publication        4
category        3470
article        28077
author           721
date           12288
title          27270
dtype: int64



In [6]:
# Count missing values per column.
df.isna().sum()

_id               0
score             0
publication       0
category       7293
article           0
author            0
date              0
title             0
dtype: int64

In [7]:
# Use the first whitespace-separated token of each title as a rough brand label.
# NOTE(review): the recorded counts include non-brand tokens such as 'The', 'A'
# and years like '2019', so this heuristic needs cleaning before it is relied on.
title_tokens = df['title'].str.split()
df['brand'] = title_tokens.str.get(0)

# Show how many reviews fall under each extracted label.
brand_counts = df["brand"].value_counts()
dict(brand_counts)

{'Samsung': 1042,
 'Sony': 756,
 'Asus': 534,
 'HP': 532,
 'LG': 501,
 'Lenovo': 483,
 'Canon': 465,
 'Acer': 428,
 'Dell': 425,
 'Panasonic': 406,
 'Epson': 290,
 'Apple': 259,
 'Amazon': 241,
 'Razer': 233,
 'Microsoft': 230,
 'Nikon': 222,
 'Philips': 219,
 'Google': 216,
 'Huawei': 213,
 'Logitech': 196,
 'Toshiba': 177,
 'MSI': 175,
 'Brother': 175,
 'BenQ': 173,
 'The': 171,
 'JBL': 151,
 'Fujifilm': 150,
 'Garmin': 136,
 'AMD': 134,
 '2019': 131,
 'HTC': 130,
 'Motorola': 120,
 'Nokia': 114,
 'Sennheiser': 107,
 'Netgear': 106,
 '2018': 103,
 'Bose': 100,
 'Olympus': 99,
 'Corsair': 99,
 'Vizio': 94,
 'Xiaomi': 93,
 '2016': 92,
 'Gigabyte': 88,
 'Intel': 86,
 'Moto': 83,
 'Optoma': 82,
 '2015': 80,
 'A': 80,
 'Nvidia': 79,
 'Pentax': 77,
 'Sigma': 77,
 'ZTE': 76,
 'Alienware': 75,
 '2017': 75,
 'Hisense': 75,
 'Oppo': 72,
 'Leica': 72,
 'D-Link': 71,
 'Adobe': 68,
 'OnePlus': 67,
 'Dyson': 67,
 'Honor': 65,
 'SteelSeries': 64,
 'Fitbit': 63,
 'Creative': 63,
 'TCL': 61,
 'Marsha

In [8]:
# Lower-case the category strings so later comparisons are case-insensitive.
# This overwrites the column in place; missing values stay missing.
df['category'] = df['category'].str.lower()

In [9]:
# Every non-null category value that does not contain '[' is treated as a
# single, already-clean category label.
individual_categories = {
    label for label in df['category'].dropna() if '[' not in label
}

# Add further labels by hand so they are recognised as well.
individual_categories |= {
    "kitchen and household",
    "auto tech",
    "laptops",
    'printers',
    'desktop pcs',
    'projectors',
    'monitors',
    'scanners',
    'keyboards',
    'security',
}

In [10]:
# Display the set of recognised single-category labels.
individual_categories

{'appliances',
 'audio visual',
 'auto tech',
 'cameras',
 'car tech',
 'celulares',
 'components',
 'computing',
 'desktop pcs',
 'entertainment',
 'fitness',
 'gaming',
 'informática',
 'keyboards',
 'kitchen and household',
 'laptops',
 'mobile phones',
 'monitors',
 'networking',
 'printers',
 'projectors',
 'scanners',
 'security',
 'smart home',
 'software',
 'tablets',
 'televisions',
 'televisori',
 'wearables'}

In [11]:
import ast
import re

def replace_categories(row, categories=None):
    """Collapse a row's raw ``category`` value into a single category label.

    The raw value is either missing, a plain label, or the string form of a
    list of labels (e.g. ``"['computing', 'laptops']"``).

    Resolution order:
      1. Missing values are returned unchanged.
      2. A value that is itself a known category is returned unchanged.
      3. The value is parsed as a Python literal; if parsing fails, or the
         literal is not a string/list/tuple, ``None`` is returned.
      4. The first string item that is exactly a known category wins.
      5. Otherwise the first known category (in alphabetical order) whose
         three-letter prefix starts a word, followed by at least one more
         character, in any string item wins.
      6. Otherwise the last item of the parsed list is returned, or ``None``
         for an empty list.

    Steps 4 and 5 make the result independent of set iteration order: several
    known categories can share a three-letter prefix, so iterating the raw set
    could return a different label from run to run.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a ``'category'`` key.
        categories: Optional collection of known category labels. Defaults to
            the notebook-level ``individual_categories``.

    Returns:
        The resolved category label, the original missing value, or ``None``.
    """
    known = individual_categories if categories is None else categories
    row_category = row['category']

    if pd.isna(row_category):
        return row_category

    # Already a clean, known label.
    if row_category in known:
        return row_category

    # Try to evaluate the category as a list literal.
    try:
        parsed = ast.literal_eval(row_category)
    except (SyntaxError, ValueError):
        # Not a valid Python literal.
        return None

    # Normalise the parsed literal: a bare string is treated as a one-item
    # list, and anything that is not a list/tuple cannot be searched.
    if isinstance(parsed, str):
        parsed = [parsed]
    elif not isinstance(parsed, (list, tuple)):
        return None

    # Only string items can be compared or regex-searched.
    labels = [item for item in parsed if isinstance(item, str)]

    # Prefer an exact match, in the order the labels were listed.
    for label in labels:
        if label in known:
            return label

    # Fall back to the three-letter prefix heuristic, in a fixed order.
    for category in sorted(known):
        pattern = r'\b{}.'.format(re.escape(category[:3]))
        if any(re.search(pattern, label) for label in labels):
            return category

    # Return the last item in the list, if the list is not empty.
    if parsed:
        return parsed[-1]

    # Empty list: nothing to resolve.
    return None


In [12]:
# Resolve every row's raw category to a single label, row by row.
# NOTE(review): this overwrites `category` in place, so re-running the cell feeds
# already-resolved values back through replace_categories — confirm that is safe.
df['category'] = df.apply(replace_categories, axis='columns')

In [13]:
# List the distinct publications present in the dataset.
pd.unique(df['publication'])

array(['TechRadar', 'CNET', 'PCMag', 'Trusted Reviews'], dtype=object)

In [28]:
!pip install plotly_express

Collecting plotly_express
  Downloading plotly_express-0.4.1-py2.py3-none-any.whl.metadata (1.7 kB)
Downloading plotly_express-0.4.1-py2.py3-none-any.whl (2.9 kB)
Installing collected packages: plotly_express
Successfully installed plotly_express-0.4.1


In [29]:
# Load the combined-category summary CSV from the data directory.
dbt_categories_path = "../data/dbt_models_core_combined_categories.csv"
dbt_df = pd.read_csv(dbt_categories_path)

In [42]:
# Pull the per-category review counts out of the frame as a plain list.
num_reviews = [count for count in dbt_df['num_reviews']]

In [45]:
# Display the list of review counts.
num_reviews

[3959, 2932, 2445, 1554, 1088, 1077, 697, 656, 522, 396]

In [50]:
# Pull the combined category names out of the frame as a plain list.
category = [label for label in dbt_df['combined_category']]

In [51]:
# Display the list of combined category names.
category

['COMPUTERS',
 'MOBILE & TABLETS',
 'AUDIO',
 'CAMERAS',
 'COMPONENTS',
 'SMART HOME',
 'GAMING',
 'PRINTERS',
 'PROJECTORS',
 'WEARABLES']

In [49]:
import plotly.graph_objects as go

# Number of categories to show on the radar chart.
TOP_N = 5

# Take the plotted values straight from dbt_df instead of retyping them, so the
# chart cannot drift from the data and does not depend on cell execution order.
top_categories = dbt_df.nlargest(TOP_N, 'num_reviews')

# Radar (polar) chart: one spoke per category, radius = number of reviews.
fig = go.Figure(data=go.Scatterpolar(
  r=top_categories['num_reviews'].tolist(),
  theta=top_categories['combined_category'].tolist(),
  fill='toself'
))

fig.update_layout(
  title=f'Number of reviews for the top {TOP_N} combined categories',
  polar=dict(
    radialaxis=dict(
      visible=True
    ),
  ),
  showlegend=False
)

fig.show()