# Team Members:

Jetendra Mulinti, Goutham Vemula, Prajeeth Nakka

Introduction: We are trying create a model which estimates shares of an online article

Steps we are following
1. Loading data
2. Cleaning data
3. EDA
4. Feature selection & PCA
5. Normalizing the data
6. Train the model with linear regression

In [2]:
### Import packages

import warnings
warnings.filterwarnings('ignore')
import pandas as pd
pd.options.display.max_columns = 999
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns
import os

### sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error

###### Standardization of the variables
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

#### Modelling
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score, KFold

#### PCA
from sklearn.decomposition import PCA

#### Automated EDA
import sweetviz
from autoviz.AutoViz_Class import AutoViz_Class

Imported v0.1.803. After importing autoviz, you must run '%matplotlib inline' to display charts inline.
    AV = AutoViz_Class()
    dfte = AV.AutoViz(filename, sep=',', depVar='', dfte=None, header=0, verbose=1, lowess=False,
               chart_format='svg',max_rows_analyzed=150000,max_cols_analyzed=30, save_plot_dir=None)


# Loading Data

In [3]:
### load data from csv via github url

github_url = 'https://raw.githubusercontent.com/JetendraMulinti/DAV-6150---DataScience/main/M4%20Practical%20Challenge%20Feature%20Selection%20%26%20Dimension%20Reduction/M4_Data.csv'

news_df = pd.read_csv(github_url)
news_df.head(2)

Unnamed: 0,url,timedelta,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,num_videos,average_token_length,num_keywords,data_channel_is_lifestyle,data_channel_is_entertainment,data_channel_is_bus,data_channel_is_socmed,data_channel_is_tech,data_channel_is_world,kw_min_min,kw_max_min,kw_avg_min,kw_min_max,kw_max_max,kw_avg_max,kw_min_avg,kw_max_avg,kw_avg_avg,self_reference_min_shares,self_reference_max_shares,self_reference_avg_sharess,weekday_is_monday,weekday_is_tuesday,weekday_is_wednesday,weekday_is_thursday,weekday_is_friday,weekday_is_saturday,weekday_is_sunday,is_weekend,LDA_00,LDA_01,LDA_02,LDA_03,LDA_04,global_subjectivity,global_sentiment_polarity,global_rate_positive_words,global_rate_negative_words,rate_positive_words,rate_negative_words,avg_positive_polarity,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity,shares
0,http://mashable.com/2013/01/07/amazon-instant-video-browser/,731,12,219,0.663594,1.0,0.815385,4,2,1,0,4.680365,5,0,1,0,0,0,0,0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,496.0,496.0,496.0,1,0,0,0,0,0,0,0,0.500331,0.378279,0.040005,0.041263,0.040123,0.521617,0.092562,0.045662,0.013699,0.769231,0.230769,0.378636,0.1,0.7,-0.35,-0.6,-0.2,0.5,-0.1875,0.0,0.1875,593
1,http://mashable.com/2013/01/07/ap-samsung-sponsored-tweets/,731,9,255,0.604743,1.0,0.791946,3,1,1,0,4.913725,4,0,0,1,0,0,0,0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,0,0,0,0,0.799756,0.050047,0.050096,0.050101,0.050001,0.341246,0.148948,0.043137,0.015686,0.733333,0.266667,0.286915,0.033333,0.7,-0.11875,-0.125,-0.1,0.0,0.0,0.5,0.0,711


In [4]:
print("No of duplicate rows: ",news_df.duplicated().sum())

No of duplicate rows:  0


In [5]:
# Strip leading and trailing spaces from column names
news_df.columns = news_df.columns.str.strip()

In [6]:
# Define the function
def columns_with_nulls_above_threshold(df, threshold=0.1):
    
    # Calculate the percentage of nulls for each column
    null_percentage = df.isnull().sum()
    
    # Identify columns where the percentage of nulls is above the threshold
    columns_above_threshold = null_percentage[null_percentage > threshold].index.tolist()
    
    return columns_above_threshold


### calling function
columns_with_nulls_above_threshold(news_df)

[]

In [7]:
news_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39644 entries, 0 to 39643
Data columns (total 61 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   url                            39644 non-null  object 
 1   timedelta                      39644 non-null  int64  
 2   n_tokens_title                 39644 non-null  int64  
 3   n_tokens_content               39644 non-null  int64  
 4   n_unique_tokens                39644 non-null  float64
 5   n_non_stop_words               39644 non-null  float64
 6   n_non_stop_unique_tokens       39644 non-null  float64
 7   num_hrefs                      39644 non-null  int64  
 8   num_self_hrefs                 39644 non-null  int64  
 9   num_imgs                       39644 non-null  int64  
 10  num_videos                     39644 non-null  int64  
 11  average_token_length           39644 non-null  float64
 12  num_keywords                   39644 non-null 

#### As the data tells & mention in data url, there are no missing values.

#  EDA

In [8]:
def plot_correlation_matrix(df, pdf_filename, target_variable):

    # Path for saving the PDF in the current working directory
    pdf_path = os.path.join(os.getcwd(), pdf_filename)
    
    # Select only quantitative columns for the correlation matrix
    quantitative_columns = df.select_dtypes(include=['float64', 'int64'])
    corr_matrix = quantitative_columns.corr()
    
    # Dynamically determine the size of the figure based on the number of columns
    n = len(corr_matrix.columns)
    fig_size = max(10, n)  # Ensure minimum size for readability
    
    with PdfPages(pdf_path) as pdf:
        # Set up the matplotlib figure with dynamic size
        fig, ax = plt.subplots(figsize=(fig_size, fig_size))
        
        # Generate a mask for the upper triangle
        mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
        
        # Generate a custom diverging colormap
        cmap = sns.diverging_palette(230, 20, as_cmap=True)
        
        # Draw the heatmap with the mask and correct aspect ratio
        sns.heatmap(corr_matrix, mask=mask, cmap=cmap, vmax=.3, center=0,
                    square=True, linewidths=.5, cbar_kws={"shrink": .5}, annot=True)
        
        plt.title('Correlation Matrix of Quantitative Columns')
        
        plt.tight_layout()
        pdf.savefig(fig)  # Save the figure into the PDF
        plt.show()  # Display the plot
        plt.close(fig)  # Close to free up memory
        
        
    # Identifying high and low correlation columns
    high_corr_threshold = 0.5
    low_corr_threshold = -0.5
    target_corr = corr_matrix[target_variable]
    
    high_corr_columns = target_corr[target_corr >= high_corr_threshold].index.tolist()
    low_corr_columns = target_corr[target_corr <= low_corr_threshold].index.tolist()
    
    # Ensure the target variable is not included in the lists
    high_corr_columns = [col for col in high_corr_columns if col != target_variable]
    low_corr_columns = [col for col in low_corr_columns if col != target_variable]
    
    print("high_correlation columns: ", high_corr_columns)
    print("low_correlation columns: ", low_corr_columns)
    
    return high_corr_columns, low_corr_columns


In [9]:
plot_correlation_matrix(news_df,'CorrelationMatrix.pdf','shares')

high_correlation columns:  []
low_correlation columns:  []


([], [])

In [10]:
###### Automated EDA (Sweetviz)

viz_report = sweetviz.analyze(news_df, target_feat = 'shares')
viz_report.show_html('sweetviz_report.html')

                                             |                                             | [  0%]   00:00 ->…

Report sweetviz_report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


In [11]:
viz = AutoViz_Class() 
df = viz.AutoViz(filename = '',dfte= news_df, verbose =1)

Shape of your Data Set loaded: (39644, 61)
#######################################################################################
######################## C L A S S I F Y I N G  V A R I A B L E S  ####################
#######################################################################################
Classifying variables in data set...
    Number of Numeric Columns =  34
    Number of Integer-Categorical Columns =  12
    Number of String-Categorical Columns =  0
    Number of Factor-Categorical Columns =  0
    Number of String-Boolean Columns =  0
    Number of Numeric-Boolean Columns =  14
    Number of Discrete String Columns =  0
    Number of NLP String Columns =  1
    Number of Date Time Columns =  0
    Number of ID Columns =  0
    Number of Columns to Delete =  0
    61 Predictors classified...
        No variables removed since no ID or low-information variables found in data set
30 numeric variables in data exceeds limit, taking top 30 variables
    List of variables

Unnamed: 0,Data Type,Missing Values%,Unique Values%,Minimum Value,Maximum Value,DQ Issue
url,object,0.0,100.0,,,No issue
timedelta,int64,0.0,1.0,8.0,731.0,No issue
n_tokens_title,int64,0.0,0.0,2.0,23.0,Column has 156 outliers greater than upper bound (16.50) or lower than lower bound(4.50). Cap them or remove them.
n_tokens_content,int64,0.0,6.0,0.0,8474.0,Column has 1933 outliers greater than upper bound (1421.00) or lower than lower bound(-459.00). Cap them or remove them.
n_unique_tokens,float64,0.0,,0.0,701.0,Column has 1604 outliers greater than upper bound (0.82) or lower than lower bound(0.26). Cap them or remove them.
n_non_stop_words,float64,0.0,,0.0,1042.0,"Column has 2328 outliers greater than upper bound (1.00) or lower than lower bound(1.00). Cap them or remove them., Column has a high correlation with ['n_unique_tokens']. Consider dropping one of them."
n_non_stop_unique_tokens,float64,0.0,,0.0,650.0,"Column has 1733 outliers greater than upper bound (0.95) or lower than lower bound(0.43). Cap them or remove them., Column has a high correlation with ['n_unique_tokens', 'n_non_stop_words']. Consider dropping one of them."
num_hrefs,int64,0.0,0.0,0.0,304.0,Column has 2169 outliers greater than upper bound (29.00) or lower than lower bound(-11.00). Cap them or remove them.
num_self_hrefs,int64,0.0,0.0,0.0,116.0,Column has 2090 outliers greater than upper bound (8.50) or lower than lower bound(-3.50). Cap them or remove them.
num_imgs,int64,0.0,0.0,0.0,128.0,Column has 7703 outliers greater than upper bound (8.50) or lower than lower bound(-3.50). Cap them or remove them.


Number of All Scatter Plots = 465
Image size of 1500x87200 pixels is too large. It must be less than 2^16 in each direction.
Could not draw Pair Scatter Plots


[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     C:\Users\JETENDRA\AppData\Roaming\nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     C:\Users\JETENDRA\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     C:\Users\JETENDRA\AppData\Roaming\nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     C:\Users\JETENDRA\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     C:\Users\JETENDRA\AppData\Roaming\nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to

All Plots done
Time to run AutoViz = 176 seconds 

 ###################### AUTO VISUALIZATION Completed ########################


### Based on the above EDA we are having many columns with the outliers so we are using Transformation techniques, post that we are using normalizing 


In [12]:
###### save the original data

original_df = news_df.copy(deep=True)
original_df.head(2)

Unnamed: 0,url,timedelta,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,num_videos,average_token_length,num_keywords,data_channel_is_lifestyle,data_channel_is_entertainment,data_channel_is_bus,data_channel_is_socmed,data_channel_is_tech,data_channel_is_world,kw_min_min,kw_max_min,kw_avg_min,kw_min_max,kw_max_max,kw_avg_max,kw_min_avg,kw_max_avg,kw_avg_avg,self_reference_min_shares,self_reference_max_shares,self_reference_avg_sharess,weekday_is_monday,weekday_is_tuesday,weekday_is_wednesday,weekday_is_thursday,weekday_is_friday,weekday_is_saturday,weekday_is_sunday,is_weekend,LDA_00,LDA_01,LDA_02,LDA_03,LDA_04,global_subjectivity,global_sentiment_polarity,global_rate_positive_words,global_rate_negative_words,rate_positive_words,rate_negative_words,avg_positive_polarity,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity,shares
0,http://mashable.com/2013/01/07/amazon-instant-video-browser/,731,12,219,0.663594,1.0,0.815385,4,2,1,0,4.680365,5,0,1,0,0,0,0,0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,496.0,496.0,496.0,1,0,0,0,0,0,0,0,0.500331,0.378279,0.040005,0.041263,0.040123,0.521617,0.092562,0.045662,0.013699,0.769231,0.230769,0.378636,0.1,0.7,-0.35,-0.6,-0.2,0.5,-0.1875,0.0,0.1875,593
1,http://mashable.com/2013/01/07/ap-samsung-sponsored-tweets/,731,9,255,0.604743,1.0,0.791946,3,1,1,0,4.913725,4,0,0,1,0,0,0,0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,0,0,0,0,0.799756,0.050047,0.050096,0.050101,0.050001,0.341246,0.148948,0.043137,0.015686,0.733333,0.266667,0.286915,0.033333,0.7,-0.11875,-0.125,-0.1,0.0,0.0,0.5,0.0,711


In [13]:
### deleting url column

del news_df['url']

In [14]:
#### Capping (For Notmally distributed data)
#### We considered lower_percentile, upper_percentile as default values. 

def cap_outliers(series, lower_percentile=0.05, upper_percentile=0.95):
    """
    Caps outliers in a pandas series to specified percentile thresholds.
    """
    quantiles = series.quantile([lower_percentile, upper_percentile])
    series_capped = series.clip(quantiles[lower_percentile], quantiles[upper_percentile])
    return series_capped


news_df_modified = original_df.drop(['url'], axis=1)
CappingColumns = news_df_modified.columns.tolist()


# Applying the function to cap outliers in the dataset
for column in CappingColumns:
    news_df[column] = cap_outliers(news_df[column])

In [15]:
news_df.head()

Unnamed: 0,timedelta,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,num_videos,average_token_length,num_keywords,data_channel_is_lifestyle,data_channel_is_entertainment,data_channel_is_bus,data_channel_is_socmed,data_channel_is_tech,data_channel_is_world,kw_min_min,kw_max_min,kw_avg_min,kw_min_max,kw_max_max,kw_avg_max,kw_min_avg,kw_max_avg,kw_avg_avg,self_reference_min_shares,self_reference_max_shares,self_reference_avg_sharess,weekday_is_monday,weekday_is_tuesday,weekday_is_wednesday,weekday_is_thursday,weekday_is_friday,weekday_is_saturday,weekday_is_sunday,is_weekend,LDA_00,LDA_01,LDA_02,LDA_03,LDA_04,global_subjectivity,global_sentiment_polarity,global_rate_positive_words,global_rate_negative_words,rate_positive_words,rate_negative_words,avg_positive_polarity,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity,shares
0,697,12,219,0.663594,1.0,0.815385,4,2,1,0,4.680365,5,0,1,0,0,0,0,0,91.15,36.2,0,69100,28316.66667,0.0,3254.922019,1783.750391,496.0,496.0,496.0,1,0,0,0,0,0,0,0,0.500331,0.378279,0.040005,0.041263,0.040123,0.521617,0.092562,0.045662,0.013699,0.769231,0.230769,0.378636,0.1,0.7,-0.35,-0.6,-0.2,0.5,-0.1875,0.0,0.1875,593
1,697,9,255,0.604743,1.0,0.791946,3,1,1,0,4.913725,4,0,0,1,0,0,0,0,91.15,36.2,0,69100,28316.66667,0.0,3254.922019,1783.750391,0.0,0.0,0.0,1,0,0,0,0,0,0,0,0.799756,0.050047,0.050096,0.050101,0.050001,0.341246,0.148948,0.043137,0.015686,0.733333,0.266667,0.286915,0.033333,0.7,-0.11875,-0.125,-0.1,0.0,0.0,0.5,0.0,711
2,697,9,211,0.57513,1.0,0.663866,3,1,1,0,4.393365,6,0,0,1,0,0,0,0,91.15,36.2,0,69100,28316.66667,0.0,3254.922019,1783.750391,918.0,918.0,918.0,1,0,0,0,0,0,0,0,0.217792,0.033334,0.033351,0.033334,0.682188,0.600264,0.27633,0.056872,0.009479,0.857143,0.142857,0.495833,0.1,1.0,-0.466667,-0.8,-0.133333,0.0,0.0,0.5,0.0,1500
3,697,9,531,0.503788,1.0,0.665635,9,0,1,0,4.404896,7,0,1,0,0,0,0,0,91.15,36.2,0,69100,28316.66667,0.0,3254.922019,1783.750391,0.0,0.0,0.0,1,0,0,0,0,0,0,0,0.028573,0.4193,0.494651,0.028905,0.028572,0.42985,0.100705,0.041431,0.020716,0.666667,0.333333,0.385965,0.136364,0.8,-0.369697,-0.6,-0.166667,0.0,0.0,0.5,0.0,1200
4,697,13,1072,0.415646,1.0,0.54089,19,9,20,0,4.682836,7,0,0,0,0,1,0,0,91.15,36.2,0,69100,28316.66667,0.0,3254.922019,1783.750391,545.0,16000.0,3151.157895,1,0,0,0,0,0,0,0,0.028633,0.028794,0.028575,0.028572,0.883641,0.513502,0.27633,0.068373,0.012127,0.860215,0.139785,0.411127,0.033333,1.0,-0.220192,-0.5,-0.05,0.454545,0.136364,0.045455,0.136364,584


# Post transforming outliers

In [16]:
## Feature Selection 

## For Feature selection we are using PCA, instead of manual removal (reason: most columns are low coorelated)

######### Split the data into Train, Test 

X = news_df.drop(['shares'], axis=1)

y = news_df[['shares']]

# Split the data into training and testing sets (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print("Shape of X_train & X_test: ",X_train.shape, X_test.shape)
print("Shape of y_train & y_test: ",y_train.shape, y_test.shape)

Shape of X_train & X_test:  (27750, 59) (11894, 59)
Shape of y_train & y_test:  (27750, 1) (11894, 1)


In [17]:
### Normalize the data (to have all values on same measure)

X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

# PCA

In [18]:
######## PCA n components 10

pca = PCA(n_components=10)
pca

In [19]:
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

In [20]:
# Initialize and train the regression model
model = LinearRegression()
model.fit(X_train_pca, y_train)

In [21]:
### Predict on the testing set

y_pred = model.predict(X_test_pca)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')
print(f'R-squared: {r2}')


Mean Squared Error: 6333965.227309034
Root Mean Squared Error: 2516.737019894815
R-squared: 0.09248266650726722


## Conclusion: 
We used the PCA 10 components (after cross checking with 3, 5, 10), it is giving best R2 Value (Yet it is low).