In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px


In [None]:
# Read the Udemy course catalogue from the Kaggle input mount and preview
# the first rows.
DATA_PATH = "/kaggle/input/udemy-courses-dataset/UdemyCoursesDataset.csv"
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Preview the last rows of the frame.
df.tail()

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Turn the 'Free' marker into a numeric zero so the column can be cast to float.
# The result is assigned back explicitly: calling replace(..., inplace=True) on
# the df['price'] selection is chained assignment, which pandas warns about and
# which does not update the frame when Copy-on-Write is enabled.
df['price'] = df['price'].replace('Free', 0)

In [None]:
# Cast the price column to floating point.
price_as_float = df['price'].astype(float)
df['price'] = price_as_float

In [None]:
# Check the dtype of the price column.
df['price'].dtype

In [None]:
import re


# Lower-case the raw duration strings so the unit checks in extract_duration
# ('hour' / 'min') are case-insensitive.
df['content_duration'] = df['content_duration'].str.lower()


def extract_duration(x):
    """Convert a raw ``content_duration`` value to a number of hours.

    Parameters
    ----------
    x : str or number
        Text such as ``'1.5 hours'``, ``'35 mins'`` or ``'3'``. Non-string
        values (for example NaN left by a missing cell) are passed through
        ``float()``.

    Returns
    -------
    float or None
        The duration in hours. Values given in minutes are divided by 60 so
        that every result shares one unit. ``None`` when no number can be
        extracted.
    """
    if not isinstance(x, str):
        # `'hour' in x` would raise TypeError on NaN / numeric cells.
        try:
            return float(x)
        except (TypeError, ValueError):
            return None

    text = x.strip().lower()
    match = re.search(r'(\d+(\.\d+)?)', text)

    if 'hour' in text:
        return float(match.group(1)) if match else None
    if 'min' in text:
        # Minutes must be rescaled; otherwise '45 mins' would rank as 45 hours.
        return float(match.group(1)) / 60 if match else None

    # No unit given: accept a bare number, reject anything else.
    try:
        return float(text)
    except ValueError:
        return None

# Parse every raw duration string into a number with extract_duration.
parsed_duration = df['content_duration'].map(extract_duration)
df['content_duration'] = parsed_duration


In [None]:
# Inspect the parsed duration column.
df['content_duration']

In [None]:
# Parse the publication timestamp into pandas datetimes so the .dt accessor
# can be used on it.
df['published_timestamp']=pd.to_datetime(df['published_timestamp'])

In [None]:
# Check the dtype of the timestamp column.
df['published_timestamp'].dtype

In [None]:
# Summary statistics for the frame's columns.
df.describe()

In [None]:
# Count missing values per column.
missing_per_column = df.isna().sum()
missing_per_column

In [None]:
# Discard every row that has at least one missing value.
df = df.dropna()

In [None]:
# Fill any remaining gaps with each column's most frequent value.
# df.mode() returns a DataFrame (one row per tied mode) and fillna() aligns a
# DataFrame argument on the row index, so passing it whole would only fill rows
# whose labels happen to exist in the mode frame. Taking the first row gives a
# per-column Series, which fillna() applies to every row.
# If rows with missing values were already dropped, this changes nothing.
column_modes = df.mode()
if not column_modes.empty:
    df = df.fillna(column_modes.iloc[0])

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Keep only the first occurrence of each fully duplicated row.
df = df.drop_duplicates()

In [None]:
# Re-count duplicated rows.
df.duplicated().sum()

In [None]:
# Remove courses that contain no lectures.
# A boolean mask does this in one vectorised pass; dropping row by row inside a
# Python loop rebuilds the frame on every hit and fails when index labels are
# not unique. The .copy() keeps later column assignments from acting on a view.
has_lectures = df['num_lectures'] != 0
df = df.loc[has_lectures].copy()

In [None]:
# Number of distinct course ids.
df['course_id'].nunique()

In [None]:
# Remove the course_id column from the frame.
df = df.drop(columns=['course_id'])

In [None]:
# Remaining column names.
df.columns

In [None]:
# (rows, columns) of the frame.
df.shape

In [None]:
# Make sure the publication timestamp is a datetime, then derive the
# publication year from it.
published_at = pd.to_datetime(df['published_timestamp'])
df['published_timestamp'] = published_at
df['year'] = published_at.dt.year

In [None]:
# Display the frame.
df

In [None]:
# Month number taken from the publication timestamp.
df['month']=df[ 'published_timestamp'].dt.month

In [None]:
# Display the frame.
df

In [None]:
# Mean subscriber count per subject, largest first.
subscribers_by_subject = df.groupby('subject')['num_subscribers']
subscribers_by_subject.mean().sort_values(ascending=False)

In [None]:
# Correlation matrix of price and number of subscribers
# (DataFrame.corr computes Pearson correlation by default).
df[['price','num_subscribers']].corr()

In [None]:
# Number of courses per level; value_counts() lists the most common level first.
df['level'].value_counts()

In [None]:
# Top 5 longest courses by content duration.
# Sort descending and keep only the first five rows; without head(5) the cell
# shows the whole sorted frame rather than a top-5 list.
(
    df[['content_duration', 'course_title']]
    .sort_values(by='content_duration', ascending=False)
    .head(5)
)

In [None]:
# Mean course price for each level, most expensive level first.
price_by_level = df.groupby('level')['price']
price_by_level.mean().sort_values(ascending=False)

In [None]:
# Number of courses in each subject, bars ordered from most to least common.
subject_order = df['subject'].value_counts().index
ax = sns.countplot(x='subject', data=df, order=subject_order)
ax.set_title('Number of Courses by Subject')
plt.xticks(rotation=20)
plt.show()

In [None]:
# Spread of course prices within each subject.
ax = sns.boxplot(x='subject', y='price', data=df)
ax.set_title('Course Price Distribution by Subject')
plt.xticks(rotation=20)
plt.show()

In [None]:
# Histogram (50 bins) with a KDE overlay of the subscriber counts.
ax = sns.histplot(df['num_subscribers'], kde=True, bins=50)
ax.set_title('Distribution of Number of Subscribers')
ax.set_xlabel('Subscribers')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Subscribers against content duration, coloured by paid / free.
ax = sns.scatterplot(
    data=df,
    x='content_duration',
    y='num_subscribers',
    hue='is_paid',
)
ax.set_title('Subscribers vs. Content Duration')
plt.show()

In [None]:
# Share of courses at each level, labelled from the counted categories.
level_counts = df['level'].value_counts()
plt.pie(
    level_counts,
    labels=level_counts.index,
    autopct='%1.2f%%',
    shadow=True,
)
plt.title('Percentage of courses by level')
plt.legend(loc='upper right')
plt.axis('equal')
plt.show()


In [None]:
# Interactive count of paid versus free courses.
paid_hist_options = dict(
    x='is_paid',
    color='is_paid',
    title='Paid and Free Courses Count',
    labels={'is_paid': 'Is Paid'},
)
fig = px.histogram(df, **paid_hist_options)
fig.show()

In [None]:
# Interactive scatter of subscriber count against price.
fig = px.scatter(
    df,
    x='price',
    y='num_subscribers',
    title='Price vs Number of Subscribers',
)
fig.show()


In [None]:
# Correlation heatmap of the numeric (and boolean) columns.
# Selecting by inclusion keeps the datetime column out of the matrix; merely
# excluding object columns lets it through, and corr() in pandas >= 2.0 no
# longer silently drops non-numeric columns.
num_df = df.select_dtypes(include=['number', 'bool'])
plt.figure(figsize=(10, 10))
sns.heatmap(num_df.corr(), annot=True)
plt.show()

In [None]:
# Bar chart of how many courses are paid and how many are free.
ax = sns.countplot(data=df, x='is_paid')
ax.set_title('paid and free courses')
plt.show()

In [None]:
# Share of paid versus free courses.
# Labels are derived from the counted categories instead of being hard-coded,
# so a slice can never be mislabelled when the free courses outnumber the paid
# ones (value_counts() orders by frequency, not by value).
paid_counts = df['is_paid'].value_counts()


def _paid_label(flag):
    """Map an is_paid value (bool or its text form) to a display label."""
    text = str(flag).strip().lower()
    if text == 'true':
        return 'paid'
    if text == 'false':
        return 'free'
    return str(flag)


paid_labels = [_paid_label(flag) for flag in paid_counts.index]

# Pull out the smallest slice; the list is sized to the data so plt.pie does not
# reject it when the number of categories is not exactly two.
paid_explode = [0] * len(paid_counts)
if len(paid_explode) > 1:
    paid_explode[-1] = 0.15

plt.pie(paid_counts, labels=paid_labels, autopct='%1.2f%%', shadow=True, explode=paid_explode)
plt.title('paid and free courses')
plt.axis('equal')
plt.show()

In [None]:
# Mean subscriber count for paid and for free courses.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(data=df, x='is_paid', y='num_subscribers', ax=ax)
ax.set_title('Average demand(free/paid)')
plt.show()

In [None]:
# Number of courses per subject, most common first.
df['subject'].value_counts()

In [None]:
# Courses per subject, bars ordered by frequency.
fig, ax = plt.subplots(figsize=(8, 5))
subject_order = df['subject'].value_counts().index
sns.countplot(x='subject', data=df, order=subject_order, ax=ax)
ax.set_title('Number of Courses per Subject')
plt.show()

In [None]:
# Share of courses in each subject, with the largest slice pulled out.
# The counts are computed once, and the explode list is sized to the number of
# subjects actually present: plt.pie raises ValueError when its length differs
# from the number of slices.
subject_counts = df['subject'].value_counts()
subject_explode = [0] * len(subject_counts)
if subject_explode:
    subject_explode[0] = 0.1

plt.pie(
    subject_counts,
    labels=subject_counts.index,
    autopct='%1.2f%%',
    shadow=True,
    explode=subject_explode,
)
plt.title('number of courses in each subject')
plt.axis('equal')
plt.show()

In [None]:
# Mean subscriber count for each subject.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(data=df, x='subject', y='num_subscribers', ax=ax)
ax.set_title('Average demand by course subject')
plt.show()

In [None]:
# Mean subscriber count for each subject, split into paid and free courses.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(data=df, x='subject', y='num_subscribers', hue='is_paid', ax=ax)
ax.set_title('Average demand by course subject')
plt.show()

In [None]:
# Courses per difficulty level, bars ordered by frequency.
fig, ax = plt.subplots(figsize=(8, 5))
level_order = df['level'].value_counts().index
sns.countplot(x='level', data=df, order=level_order, ax=ax)
ax.set_title('Number of Courses by Difficulty Level')
plt.show()

In [None]:
# Percentage of courses at each level, with the rarest level pulled out.
# Labels come from the value_counts() index so each slice is always named after
# the category it counts; a hand-written label list silently mislabels slices
# if the frequency order or the set of levels differs from what was assumed.
level_counts = df['level'].value_counts()

# value_counts() sorts descending, so the last slice is the smallest one.
level_explode = [0] * len(level_counts)
if len(level_explode) > 1:
    level_explode[-1] = 0.2

plt.figure(figsize=(7, 7))
plt.pie(
    level_counts,
    labels=level_counts.index,
    autopct='%1.f%%',
    shadow=True,
    explode=level_explode,
)
plt.title('the percentage of each level')
plt.legend()
plt.axis('equal')
plt.show()

In [None]:
# Histogram of course prices in 20 bins.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(x='price', data=df, bins=20, ax=ax)
ax.set_title('Distribution of course prices')
plt.show()

In [None]:
# Subscriber count plotted against course price.
price_scatter_axes = dict(x='price', y='num_subscribers')
sns.scatterplot(data=df, **price_scatter_axes)
plt.show()

In [None]:
# Subscriber count plotted against the number of lectures.
lecture_scatter_axes = dict(x='num_lectures', y='num_subscribers')
sns.scatterplot(data=df, **lecture_scatter_axes)
plt.show()

In [None]:
# Subscriber count plotted against content duration.
duration_scatter_axes = dict(x='content_duration', y='num_subscribers')
sns.scatterplot(data=df, **duration_scatter_axes)
plt.show()

In [None]:
# Sum of course prices per publication year, drawn as a line.
yearly_price_total = df.groupby('year')['price'].sum()
plt.plot(yearly_price_total, linestyle='-', linewidth=3, alpha=1)
plt.xlabel('years', color='k', size=12)
plt.ylabel('price', color='k', size=12)
plt.title('Total price of courses over the years')
plt.show()

In [None]:
# Number of courses published in each calendar month.
# `order` pins the categorical axis to months 1..12, so tick position i always
# holds month i + 1. Without it the axis only contains months present in the
# data, and the Jan..Dec labels shift whenever a month has no courses.
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
               'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
plt.figure(figsize=(8, 5))
sns.countplot(data=df, x='month', order=list(range(1, 13)))
plt.title('Distribution of courses published over months')
plt.xticks(range(0, 12), month_names)
plt.show()

In [None]:
# Five courses with the most subscribers.
top_by_subscribers = df.sort_values(by='num_subscribers', ascending=False).head()
sns.catplot(
    data=top_by_subscribers,
    x='course_title',
    y='num_subscribers',
    kind='bar',
    height=8,
    aspect=2.4,
)
plt.show()

In [None]:
# Five courses with the most reviews.
top_by_reviews = df.sort_values(by='num_reviews', ascending=False).head()
sns.catplot(
    data=top_by_reviews,
    x='course_title',
    y='num_reviews',
    kind='bar',
    height=8,
    aspect=2.4,
)
plt.show()

In [None]:
# Number of courses at each distinct price.
fig, ax = plt.subplots(figsize=(16, 8))
sns.countplot(x='price', data=df, ax=ax)
plt.show()


In [None]:
# Twenty most expensive courses, drawn as horizontal bars.
most_expensive = df.sort_values(by='price', ascending=False).head(20)
sns.catplot(
    data=most_expensive,
    x='price',
    y='course_title',
    kind='bar',
    height=8,
    aspect=2.4,
)
plt.show()

In [None]:
# Ten courses with the most lectures, drawn as horizontal bars.
most_lectures = df.sort_values(by='num_lectures', ascending=False).head(10)
sns.catplot(
    data=most_lectures,
    x='num_lectures',
    y='course_title',
    kind='bar',
    height=8,
    aspect=2.5,
)
plt.show()

In [None]:
# Ten courses with the longest content duration, drawn as horizontal bars.
longest_courses = df.sort_values(by='content_duration', ascending=False).head(10)
sns.catplot(
    data=longest_courses,
    x='content_duration',
    y='course_title',
    kind='bar',
    height=8,
    aspect=2.5,
)
plt.show()