## Get Data

In [1]:
! pip install kaggle

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
! mkdir ~/.kaggle

In [3]:
! cp kaggle.json ~/.kaggle/

In [4]:
! chmod 600 ~/.kaggle/kaggle.json

In [5]:
! kaggle datasets download -d hossaingh/udemy-courses

Downloading udemy-courses.zip to /content
 99% 608M/613M [00:05<00:00, 112MB/s]
100% 613M/613M [00:05<00:00, 121MB/s]


In [6]:
! unzip udemy-courses.zip

Archive:  udemy-courses.zip
  inflating: Comments.csv            
  inflating: Course_info.csv         


## Prep Data

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [2]:
df = pd.read_csv('Comments.csv')

In [3]:
df.head()

Unnamed: 0,id,course_id,rate,date,display_name,comment
0,88962892,3173036,1.0,2021-06-29T18:54:25-07:00,Rahul,I think a beginner needs more than you think.\...
1,125535470,4913148,5.0,2022-10-07T11:17:41-07:00,Marlo,Aviva is such a natural teacher and healer/hea...
2,68767147,3178386,3.5,2020-10-19T06:35:37-07:00,Yamila Andrea,Muy buena la introducción para entender la bas...
3,125029758,3175814,5.0,2022-09-30T21:13:49-07:00,Jacqueline,This course is the best on Udemy. This breakd...
4,76584052,3174896,4.5,2021-01-30T08:45:11-08:00,Anthony,I found this course very helpful. It was full ...


In [4]:
reviews_df = df[['display_name', 'course_id', 'rate']]

In [5]:
reviews_df.head()

Unnamed: 0,display_name,course_id,rate
0,Rahul,3173036,1.0
1,Marlo,4913148,5.0
2,Yamila Andrea,3178386,3.5
3,Jacqueline,3175814,5.0
4,Anthony,3174896,4.5


In [6]:
reviews_df.describe().round(2)

Unnamed: 0,course_id,rate
count,9411727.0,9411727.0
mean,1589933.15,4.51
std,1100058.97,0.86
min,1769.0,0.5
25%,764164.0,4.0
50%,1333640.0,5.0
75%,2243978.0,5.0
max,4913148.0,5.0


In [12]:
reviews_df.isnull().sum()

display_name    75362
course_id           0
rate                0
dtype: int64

In [8]:
review_df = reviews_df.dropna()
review_df = reviews_df.drop_duplicates()

In [10]:
courses = review_df['course_id'].value_counts()
courses

567828     23031
793796     21477
914296     18245
1565838    17786
625204     17542
           ...  
3314984        1
4139034        1
4375656        1
2172942        1
3173036        1
Name: course_id, Length: 162995, dtype: int64

In [13]:
c_filter = courses[courses>500]
review_df = review_df.loc[review_df['course_id'].isin(c_filter.axes[0])]
review_df

Unnamed: 0,display_name,course_id,rate
2720695,Larry,1055720,5.0
2720696,Carley,1055720,5.0
2720697,Abby,1055720,4.0
2720698,Adolphus,1055720,5.0
2720699,Vivienne,1055720,5.0
...,...,...,...
9411719,Sathyanarayanan,567828,5.0
9411723,Rao Saurabh,567828,5.0
9411724,Raveesh,567828,5.0
9411725,Wendell,567828,5.0


In [14]:
reviewer_names = review_df['display_name'].value_counts()
reviewer_names

Anonymized         9524
David              6724
Daniel             5899
Michael            5684
John               5090
                   ... 
Gabriel Taborda       1
Joceli Miguel         1
Wesllen Santos        1
Jonatha Rihan         1
Rao Saurabh           1
Name: display_name, Length: 858473, dtype: int64

In [16]:
reviewer_filter = reviewer_names[(reviewer_names>3) & (reviewer_names<8)]
review_df = review_df.loc[review_df['display_name'].isin(reviewer_filter.axes[0])]
review_df

Unnamed: 0,display_name,course_id,rate
2720716,Jacynthe,1055720,5.0
2720731,Norval,1055720,5.0
2720733,Dovie,1055720,5.0
2720743,Lelah,1055720,4.0
2720758,Jany,1055720,4.0
...,...,...,...
9411631,Linet,567828,5.0
9411656,Drishtant,567828,5.0
9411665,Gerardo Rivera,567828,5.0
9411667,Claro,567828,5.0


In [17]:
groupby_reviewer = review_df.groupby(["display_name"])
delete = []
for user in groupby_reviewer.groups.keys():
     if groupby_reviewer.get_group(user).nunique()["course_id"] != groupby_reviewer.get_group(user).count()["course_id"]:
        delete.append(user)

review_df = review_df.loc[~review_df['display_name'].isin(delete)]
review_df

Unnamed: 0,display_name,course_id,rate
2720716,Jacynthe,1055720,5.0
2720731,Norval,1055720,5.0
2720758,Jany,1055720,4.0
2720764,Delmer,1055720,5.0
2720799,Merl,1055720,5.0
...,...,...,...
9411631,Linet,567828,5.0
9411656,Drishtant,567828,5.0
9411665,Gerardo Rivera,567828,5.0
9411667,Claro,567828,5.0


In [18]:
review_df['display_name'].value_counts()

Jacynthe           7
Gratian            7
Burçak             7
Samuel Kweku       7
Chander Shekhar    7
                  ..
Shatlyk            4
Harmi              4
Shoeib             4
J. Scott           4
Ningning           4
Name: display_name, Length: 54104, dtype: int64

In [19]:
review_df['course_id'].nunique()

2549

In [20]:
le = LabelEncoder()
reviewer_id = le.fit_transform(review_df["display_name"])
review_df.insert(0, "user_id", reviewer_id, True)
review_df

Unnamed: 0,user_id,display_name,course_id,rate
2720716,20305,Jacynthe,1055720,5.0
2720731,35157,Norval,1055720,5.0
2720758,20731,Jany,1055720,4.0
2720764,10938,Delmer,1055720,5.0
2720799,31554,Merl,1055720,5.0
...,...,...,...,...
9411631,27372,Linet,567828,5.0
9411656,12226,Drishtant,567828,5.0
9411665,16543,Gerardo Rivera,567828,5.0
9411667,9109,Claro,567828,5.0


In [21]:
review_df.to_csv("reviews_cleaned.csv", index=False)

In [28]:
from google.colab import drive
drive.mount('drive')

Mounted at drive


In [22]:
!cp reviews_cleaned.csv "drive/My Drive/Project"

In [None]:
!cp Course_info.csv "drive/My Drive/Project"