In [1]:
import re
import os
import pickle
import pandas as pd

### Post Data Cleaning

In [2]:
def read_pickle(file_path):
    """
    Deserialize the object stored in a pickle file.

    args:
        file_path : location of the pickle file to open
    returns:
        the object that pickle reconstructs from the file
    note:
        unpickling can execute arbitrary code, so only pass files
        from a trusted source
    """
    with open(file_path, mode="rb") as stream:
        return pickle.load(stream)

# Directory holding the crawled post pickles.
post_data_dir = "../data/linkedincrawldata/post_data_final"

# os.listdir returns entries in arbitrary order; sorting the names makes the
# row order of post_data_df (and of anything saved from it) reproducible.
post_files = [
    os.path.join(post_data_dir, file_)
    for file_ in sorted(os.listdir(post_data_dir))
]

# Each pickle must hold an iterable of records, since its contents are
# appended one by one with list.extend.
post_data = []
for file_ in post_files:
    post_data.extend(read_pickle(file_))

post_data_df = pd.DataFrame(post_data)
post_data_df.head(2)

Unnamed: 0,unique_id,name,profile_link,content,likes,comments,shares,time_posted,url_link
0,f7f09bda-eced-4a7e-9294-5582e265e551,James Pastore,https://www.linkedin.com/in/james-pastore-2b92...,Extremely pertinent this time of the school ye...,3363,106 comments,0 shares,1w •\n1 week ago,https://media-exp1.licdn.com/dms/image/C4E22AQ...
1,babe72d6-40ff-417f-b9e0-872092a4c50b,SoilCup- Connecting Entrepreneurs,https://www.linkedin.com/in/soilcup,#meaningful,1,0 comments,0 shares,5d •\n5 days ago,


In [3]:
# step 1: if the scraped "comments" value contains 'share' (i.e. it actually holds the share count), copy it into "shares"
# step 2: then reset every such "comments" value to '0 comments'

def copy_shares(comment,shares):
    """
    args:
        comment : text from the "comments" column
        shares  : text from the "shares" column
    returns:
        `comment` when it contains the substring 'share', otherwise `shares`
    """
    return comment if 'share' in comment else shares

def _shares_for_row(row):
    """Run copy_shares on one DataFrame row's "comments" and "shares" values."""
    return copy_shares(row["comments"], row["shares"])

# Row-wise pass: overwrite "shares" with whatever copy_shares selects.
post_data_df["shares"] = post_data_df.apply(_shares_for_row, axis=1)

def replace_comments(comments):
    """
    args:
        comments : text from the "comments" column
    returns:
        '0 comments' when the text contains the substring 'share',
        otherwise the text unchanged
    """
    if 'share' not in comments:
        return comments
    return '0 comments'

def _leading_count(text, sep):
    """Take the part of `text` before the first `sep`, drop commas, return it as int."""
    head = text.split(sep)[0]
    return int(re.sub(",", "", head))

# "comments": first neutralise values that contain 'share', then parse the count.
post_data_df["comments"] = (
    post_data_df["comments"]
    .apply(replace_comments)
    .apply(lambda text: _leading_count(text, " "))
)
# "shares" is space-separated; "likes" is cut at the first newline instead.
post_data_df["shares"] = post_data_df["shares"].apply(lambda text: _leading_count(text, " "))
post_data_df["likes"] = post_data_df["likes"].apply(lambda text: _leading_count(text, "\n"))

In [4]:
# Rows whose time_posted is the literal "Promoted" are flagged and given the
# placeholder age "-1h". The "-1h" sentinel is also accepted when computing
# the flag, so re-running this cell on already-processed data keeps
# is_promoted intact instead of silently resetting it to 0.
post_data_df["is_promoted"] = post_data_df["time_posted"].apply(
    lambda x: 1 if x in ("Promoted", "-1h") else 0
)
post_data_df["time_posted"] = post_data_df["time_posted"].apply(
    lambda x: "-1h •\n-1 hour ago" if x == "Promoted" else x
)
# Keep only the text before the first space (e.g. "1w •\n1 week ago" -> "1w").
post_data_df["time_posted"] = post_data_df["time_posted"].apply(lambda x: x.split(" ")[0])

In [6]:
# Write the cleaned post DataFrame to disk as a pickle.
cleaned_post_path = "../data/cleaned_post_data.pickle"
post_data_df.to_pickle(cleaned_post_path)

### Profile Data Cleaning

In [8]:
# Directory holding the crawled profile pickles.
profile_data_dir = "../data/linkedincrawldata/profile_data"

# os.listdir returns entries in arbitrary order; sorting the names makes the
# order of the collected profile records reproducible.
profile_files = [
    os.path.join(profile_data_dir, file_)
    for file_ in sorted(os.listdir(profile_data_dir))
]

# Each pickle must hold an iterable of records, since its contents are
# appended one by one with list.extend.
profile_data = []
for file_ in profile_files:
    profile_data.extend(read_pickle(file_))

In [13]:
# Build one DataFrame from all collected profile records.
profile_data_df=pd.DataFrame(profile_data)
# Preview the first two rows.
profile_data_df.head(2)

Unnamed: 0,link,followers,connections,profile_description
0,https://www.linkedin.com/in/priyakanagarajan,"8,030 followers",500+ connections,EA to VP | Site Engagement BP
1,https://www.linkedin.com/in/peter-erdei-30b51a134,"11,440 followers",500+ connections,"At Erdei Designs, we utilize old-world craftsm..."


In [14]:
def _fill_missing_count(value, placeholder):
    """Return `placeholder` when `value` is -1 or "", otherwise `value` unchanged."""
    if value == -1 or value == "":
        return placeholder
    return value

# -1 and "" are presumably the crawler's missing-value markers (TODO confirm);
# they become zero-count strings so they have the same "<n> <label>" shape as
# the other values.
profile_data_df["followers"] = profile_data_df["followers"].apply(
    lambda raw: _fill_missing_count(raw, "0 followers")
)
profile_data_df["connections"] = profile_data_df["connections"].apply(
    lambda raw: _fill_missing_count(raw, "0 connections")
)

In [15]:
# Keep the text before the first space, drop thousands separators, convert to int
# (e.g. "8,030 followers" -> 8030).
profile_data_df["followers"] = profile_data_df["followers"].apply(
    lambda x: int(re.sub(",", "", x.split(" ")[0]))
)
# Same for connections, dropping the "+" suffix (e.g. "500+ connections" -> 500).
# The pattern is a raw string: "\+" in a normal literal is an invalid escape
# sequence (DeprecationWarning since Python 3.6, SyntaxWarning in 3.12).
profile_data_df["connections"] = profile_data_df["connections"].apply(
    lambda x: int(re.sub(r"\+", "", x.split(" ")[0]))
)


In [17]:
# Write the cleaned profile DataFrame to disk as a pickle.
cleaned_profile_path = "../data/cleaned_profile_data.pickle"
profile_data_df.to_pickle(cleaned_profile_path)