
# Data Cleaning

In [1]:
#!pip install pandas
#!pip install numpy
#!pip install nltk
#!pip install multiprocess
!pip install emoji

# Import statements
import zipfile
import os

import shutil

import pandas as pd
import numpy as np 
import csv
import re
import string
import time
from datetime import datetime, timedelta, date


import multiprocess
from multiprocess import Pool, Process
import multiprocessing as mp



import emoji
import nltk
from nltk import pos_tag, pos_tag_sents
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
# from sklearn.feature_extraction.text import TfidfVectorizer

import warnings
warnings.filterwarnings("ignore")

# Fetch the NLTK resources this notebook relies on (each call is a no-op
# when the package is already up to date, as the cell output shows).
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
# English stopword set with the listed words taken OUT of it, so 'at', 'do',
# 'your', 'from', 'to', 'out', 'no' and 'the' are kept in the cleaned text.
stop_words = set(stopwords.words('english')) - set(['at', 'do', 'your', 'from', 'to', 'out', 'no', 'the'])

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/manikya_varshney/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/manikya_varshney/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/manikya_varshney/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/manikya_varshney/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
# Extract zip file

def extract_zips(filename, temp_folder):
  """Extract every member of the zip archive `filename` into `temp_folder`.

  The archive is opened through a context manager so the file handle is
  released even when extraction fails part-way; the earlier explicit
  close() call was skipped whenever extractall() raised.
  """
  with zipfile.ZipFile(filename, 'r') as zip_ref:
    zip_ref.extractall(temp_folder)

In [3]:
def create_filename(FILENAME, TEMP_FOLDER):
  """Derive the per-day file prefix from the path of a zip archive.

  The base name is cut at its first "." and then everything from the last
  "-" onwards is dropped, e.g. ".../h01-20200818-20200824.zip" gives
  "h01-20200818".

  TEMP_FOLDER is accepted for interface compatibility but is not used.
  """
  # Isolate the base name BEFORE cutting at the first dot; cutting the full
  # path first truncated the result whenever a directory contained a dot
  # (e.g. "./data/..." or ".../v1.2/...").
  stem = os.path.basename(FILENAME).split(".")[0]
  return stem.rsplit("-", 1)[0]

In [4]:
def get_date_string(FILESNAMES):
  """Return the text after the last "-" in FILESNAMES.

  When FILESNAMES contains no "-" the whole string is returned.
  """
  return FILESNAMES.rsplit("-", 1)[-1]

def strip_date(FILESNAMES):
  """Parse the trailing YYYYMMDD token of FILESNAMES.

  Returns a pair (date_to_analyse, year): the date formatted as
  "%a %b %d" (e.g. "Tue Aug 18") and the four-digit year as a string.
  Raises ValueError when the trailing token is not a valid YYYYMMDD date.
  """
  parsed = datetime.strptime(get_date_string(FILESNAMES), "%Y%m%d")
  return parsed.strftime("%a %b %d"), parsed.strftime("%Y")

def get_date_numbers(date_string):
    """Return (day, month) as zero-padded strings for a "%a %b %d" date.

    Example: "Tue Aug 18" -> ("18", "08").

    The input carries no year. datetime.strptime would default to 1900,
    which is not a leap year, so "Sat Feb 29" raised ValueError. A fixed
    leap year is appended before parsing so that every calendar day,
    including Feb 29, is accepted. The year itself never reaches the result.
    """
    ob = datetime.strptime("{} 2000".format(date_string), "%a %b %d %Y")
    return ob.strftime("%d"), ob.strftime("%m")
    
def get_next_date(current_date,year):
    """Return the day after `current_date` formatted as YYYYMMDD.

    `current_date` is a "%a %b %d" string (as produced by strip_date) and
    `year` supplies the year it belongs to (anything int() accepts).
    """
    day, month = get_date_numbers(current_date)
    following_day = datetime(int(year), int(month), int(day)) + timedelta(days = 1)
    return following_day.strftime("%Y%m%d")
    
def daterange(start_date, end_date):
    """Yield consecutive days from start_date up to, but excluding, end_date.

    Nothing is yielded when end_date is not after start_date.
    """
    span_days = int((end_date - start_date).days)
    offset = 0
    while offset < span_days:
        yield start_date + timedelta(offset)
        offset += 1
        
# h01-20201001-20201008
def get_date_range(FILESNAMES):
    """Return (start_date, end_date) parsed from an archive name.

    The name is expected to end in two YYYYMMDD tokens separated by "-",
    e.g. "h01-20201001-20201008.zip" -> (date(2020, 10, 1), date(2020, 10, 8)).
    Both values are datetime.date objects.

    The tokens are parsed directly with "%Y%m%d". The earlier version went
    through a year-less "%a %b %d" string and back, which failed for
    Feb 29 and depended on locale-specific day and month names. The base
    name is isolated first so dots in parent directories do not truncate it.

    Raises ValueError when the name does not end in two valid date tokens.
    """
    stem = os.path.basename(FILESNAMES).split('.')[0]
    start_token, end_token = stem.split("-")[-2:]
    start_date = datetime.strptime(start_token, "%Y%m%d").date()
    end_date = datetime.strptime(end_token, "%Y%m%d").date()
    return start_date, end_date

In [5]:
def create_directory(directory):
    """Create `directory` (including missing parents) if it does not exist.

    exist_ok=True makes the call idempotent without a separate existence
    check, so there is no window in which another process can create the
    directory between the check and the makedirs call.

    Raises FileExistsError if the path exists but is not a directory.
    """
    os.makedirs(directory, exist_ok=True)

In [6]:
def get_folder_name_from_zip(FILENAME):
    """Return the archive's base name up to its first ".".

    Example: ".../h01-20200818-20200824.zip" -> "h01-20200818-20200824".
    The base name is taken before cutting at the dot, so a dot in a parent
    directory (e.g. "./" or "v1.2/") no longer truncates the result.
    """
    name = os.path.basename(FILENAME).split('.')[0]
    return name
    

In [7]:
# Module-level registry shared by get_quote_rt and the text-selection helpers:
# RT_id / QT_id membership is tested against its keys. It is not cleared
# anywhere in the visible cells, so entries persist across calls and dates.
added_ids = {}

In [8]:
# Keep data in english only
def remove_other_langs(data):
  """Return the rows whose 'lang' column equals 'en', re-indexed from 0.

  The input frame is left untouched; a new frame is returned.
  """
  is_english = data['lang'].eq('en')
  english_rows = data.loc[is_english]
  return english_rows.reset_index(drop=True)

In [9]:
def save_other_date(data, old_data, current_date, TEMP_FOLDER, FILES_NAMES, FOLDER_NAME):
    """Carry rows of `old_data` that are missing from `data` over to the next day.

    The leftover rows are written to "<TEMP_FOLDER>/<FOLDER_NAME>/h01-<next day>.csv",
    where <next day> is the day after `current_date` ("%a %b %d" string).
    Nothing is written when both frames are equal or no leftover rows remain.

    This function is invoked once per source file, and every call for the
    same day targets the same CSV. The file is therefore appended to; it
    was previously rewritten on each call, which kept only the leftovers of
    whichever source file happened to be processed last. The index is not
    written, so a carried-over file has the same columns as a raw file.

    Because the file accumulates, the temp folder should be cleared between
    full runs to avoid carrying the same rows twice.

    FILES_NAMES is accepted for interface compatibility but is not used.
    """
    if data.equals(old_data):
        return
    # NOTE(review): `year` is not a parameter; it is resolved from module
    # scope at call time, so it must already be bound by the calling cell.
    date_to_save = get_next_date(current_date, year)
    print(date_to_save)

    directory = '{}/{}'.format(TEMP_FOLDER, FOLDER_NAME)
    create_directory(directory)

    # Rows present in both frames occur at least twice after the concat, so
    # keep=False leaves only rows unique to old_data. A row duplicated
    # inside old_data itself is dropped as well.
    combined = pd.concat([data, old_data]).drop_duplicates(keep=False)
    if combined.empty:
        return

    target = "{}/h01-{}.csv".format(directory, date_to_save)
    # Write the header only when creating the file. Appended rows rely on
    # every source file sharing one column layout -- TODO confirm.
    write_header = not os.path.exists(target)
    combined.to_csv(target, mode="a", header=write_header, index=False)

In [10]:
# Keep specific date
def remove_other_dates(data, date_to_analyse, TEMP_FOLDER, FILES_NAMES, FOLDER_NAME):
  """Keep only rows whose 'created_at' starts with `date_to_analyse`.

  The first 10 characters of 'created_at' are compared with
  `date_to_analyse`. The filtered frame (re-indexed from 0) and the
  unfiltered frame are both handed to save_other_date, and the filtered
  frame is returned.
  """
  day_prefix = data['created_at'].str[:10]
  matches_day = day_prefix == date_to_analyse
  new_data = data[matches_day].reset_index(drop=True)
  save_other_date(new_data, data, date_to_analyse, TEMP_FOLDER, FILES_NAMES, FOLDER_NAME)
  return new_data

In [11]:
# Creating the RT Column
def create_rt_column(data):
  """Add a boolean 'RT' column: True where 'text' begins with "RT".

  The column is added to `data` in place and the same frame is returned.
  """
  leading_two = data['text'].str.slice(0, 2)
  data['RT'] = leading_two.eq('RT')
  return data

In [12]:
# Parse CSV data
def parse_data_from_file(filename, date_to_analyse, TEMP_FOLDER, FILES_NAMES, FOLDER_NAME):
  """Read one CSV and apply the language filter, date filter and RT flag.

  Returns the processed DataFrame. The function is best-effort: when
  reading or processing fails, the error is reported and an empty
  DataFrame is returned so the remaining files can still be processed.
  """
  print("reading file {}".format(filename))
  try:
    data = pd.read_csv(filename, index_col = None, header=0, engine = 'python',encoding = "utf-8")
    data = remove_other_langs(data)
    data = remove_other_dates(data, date_to_analyse, TEMP_FOLDER, FILES_NAMES, FOLDER_NAME)
    data = create_rt_column(data)
    return data
  except Exception as exc:
    # Catch Exception rather than everything, so KeyboardInterrupt and
    # SystemExit still stop the run, and say why the file was skipped
    # instead of dropping it silently.
    print("skipping file {}: {!r}".format(filename, exc))
    return pd.DataFrame()

In [13]:
# Parse all files of the same date
def parse_all_files(TEMP_FOLDER, FILES_NAMES, date_to_analyse, FOLDER_NAME):
  """Parse every file in TEMP_FOLDER/FOLDER_NAME whose name starts with FILES_NAMES.

  Returns the per-file frames concatenated row-wise (original per-file
  index labels are kept). When no file matches, an empty DataFrame is
  returned, mirroring what parse_data_from_file returns for a file it
  cannot read; pd.concat on an empty list would raise ValueError.
  """
  folder = os.path.join(TEMP_FOLDER, FOLDER_NAME)
  # os.listdir returns names in arbitrary order; sorting makes the row order
  # of the result reproducible from run to run.
  matching = sorted(name for name in os.listdir(folder) if name.startswith(FILES_NAMES))
  new_data = [
      parse_data_from_file(os.path.join(folder, name), date_to_analyse, TEMP_FOLDER, FILES_NAMES, FOLDER_NAME)
      for name in matching
  ]
  if not new_data:
    return pd.DataFrame()
  return pd.concat(new_data, axis = 0)

In [14]:
def get_text_or_extended_text(data, added_ids):
  """Set FINAL_TEXT to the extended full text where present, else 'text'.

  Writes the 'FINAL_TEXT' column on `data` in place, merges
  data["id"].to_dict() into `added_ids`, and returns the new column.
  """
  has_extended = data['extended_tweet_full_text'].notnull()
  data["FINAL_TEXT"] = np.where(has_extended, data["extended_tweet_full_text"], data["text"])
  # NOTE(review): Series.to_dict() maps index label -> value, so the keys
  # added here are row index labels and the tweet ids become the values.
  index_to_id = data["id"].to_dict()
  added_ids.update(index_to_id)
  return data["FINAL_TEXT"]

def get_quoted_text(data, added_ids):
  """Set FINAL_TEXT to 'QT_full_text' where present, else 'QT_text'.

  Writes the 'FINAL_TEXT' column on `data` in place, merges the to_dict()
  of the 'id' and then the 'QT_id' column into `added_ids`, and returns
  the new column.
  """
  has_full_quote = data['QT_full_text'].notnull()
  data["FINAL_TEXT"] = np.where(has_full_quote, data["QT_full_text"], data["QT_text"])
  # NOTE(review): to_dict() is keyed by index label, so both updates write
  # to the same keys and the 'QT_id' values overwrite the 'id' values.
  for column in ("id", "QT_id"):
    added_ids.update(data[column].to_dict())
  return data["FINAL_TEXT"]

def get_text_or_full_text_rt(data, added_ids):
  """Set FINAL_TEXT to 'RT_full_text' where present, else 'RT_text'.

  Writes the 'FINAL_TEXT' column on `data` in place, merges the to_dict()
  of the 'id' and then the 'RT_id' column into `added_ids`, and returns
  the new column.
  """
  has_full_retweet = data['RT_full_text'].notnull()
  data["FINAL_TEXT"] = np.where(has_full_retweet, data["RT_full_text"], data["RT_text"])
  # NOTE(review): to_dict() is keyed by index label, so both updates write
  # to the same keys and the 'RT_id' values overwrite the 'id' values.
  for column in ("id", "RT_id"):
    added_ids.update(data[column].to_dict())
  return data["FINAL_TEXT"]

def get_quote_rt_full(data, added_ids):
  """Set FINAL_TEXT to the retweet text followed directly by the quoted text.

  The retweet part is computed first, then the quoted part; both helper
  calls also update `added_ids` and write 'FINAL_TEXT' themselves before
  the concatenated result is stored. No separator is inserted between the
  two parts. Returns the resulting 'FINAL_TEXT' column.
  """
  retweet_part = get_text_or_full_text_rt(data, added_ids)
  quoted_part = get_quoted_text(data, added_ids)
  data["FINAL_TEXT"] = retweet_part + quoted_part
  return data["FINAL_TEXT"]



def get_quote_rt(data, is_quote, is_rt):
  """Select one quote/retweet category of `data` and fill its FINAL_TEXT column.

  Rows are kept when 'is_quote_tweet' == is_quote and 'RT' == is_rt. Exactly
  one of the four branches below then runs, depending on the two flags.
  The module-level `added_ids` dict is read and updated (it is not a
  parameter of this function). Returns the selected, possibly filtered, frame.

  NOTE(review): Python evaluates every argument of np.where before np.where
  itself runs. Helper calls passed as arguments therefore always execute
  (assigning 'FINAL_TEXT' and updating added_ids) for all selected rows,
  regardless of the mask. dict.update() returns None, so it contributes
  None values where it is passed as an argument.
  """
  # `data` becomes a .loc selection of the caller's frame; the column
  # assignments below are made on that selection.
  data = data.loc[(data['is_quote_tweet'] == is_quote) & (data['RT'] == is_rt)]

  # Plain tweets: register id -> is_quote_tweet (keys are tweet ids here)
  # and take the extended text when available.
  if not is_quote and not is_rt:
    added_ids.update(dict(zip(data["id"], data["is_quote_tweet"])))
    data["FINAL_TEXT"] = np.where(data['extended_tweet_full_text'].notnull(), data["extended_tweet_full_text"], data["text"])

  # Quotes that are not retweets: own text where QT_id is already a key of
  # added_ids, quoted text otherwise. The mask is computed before the two
  # helper calls add their entries.
  if is_quote and not is_rt:
    data["FINAL_TEXT"] = np.where(data["QT_id"].isin(added_ids.keys()), get_text_or_extended_text(data, added_ids), get_quoted_text(data, added_ids))

  # Retweets that are not quotes: None where RT_id is already a key of
  # added_ids, retweet text otherwise; rows left without text are dropped.
  if not is_quote and is_rt:
    data["FINAL_TEXT"] = np.where(data["RT_id"].isin(added_ids.keys()), added_ids.update(data['id'].to_dict()), get_text_or_full_text_rt(data, added_ids))
    data = data[data["FINAL_TEXT"].notna()]

  # Quoted retweets.
  # NOTE(review): each of the four assignments replaces 'FINAL_TEXT'
  # entirely (None wherever its mask is False), so only the last assignment
  # determines the stored values. The earlier lines still update added_ids,
  # which changes the masks computed by the later lines -- confirm intent.
  if is_quote and is_rt:
    data["FINAL_TEXT"] = np.where((data["RT_id"].isin(added_ids.keys())) & (data["QT_id"].isin(added_ids.keys())), added_ids.update(data['id'].to_dict()), None)
    data["FINAL_TEXT"] = np.where((data["RT_id"].isin(added_ids.keys())) & (~data["QT_id"].isin(added_ids.keys())), get_quoted_text(data, added_ids), None)
    data["FINAL_TEXT"] = np.where((~data["RT_id"].isin(added_ids.keys())) & (data["QT_id"].isin(added_ids.keys())), get_text_or_full_text_rt(data, added_ids), None)
    data["FINAL_TEXT"] = np.where((~data["RT_id"].isin(added_ids.keys())) & (~data["QT_id"].isin(added_ids.keys())), get_quote_rt_full(data, added_ids), None)
    data = data[data["FINAL_TEXT"].notna()]

  return data

In [15]:
def separate_df(filename, TEMP_FOLDER, FILES_NAMES, date_to_analyse, FOLDER_NAME):
    """Extract the archive, parse one day's files and split them by tweet category.

    Returns a 4-tuple of frames in the order
    (non_quote_non_rt, quote_non_rt, quote_rt, non_quote_rt).
    """
    extract_zips(filename, TEMP_FOLDER)
    data = parse_all_files(TEMP_FOLDER, FILES_NAMES, date_to_analyse, FOLDER_NAME)

    # (is_quote, is_rt) pairs in processing order. The order is significant
    # because every get_quote_rt call reads and extends the shared added_ids.
    category_flags = (
        (False, False),
        (True, False),
        (True, True),
        (False, True),
    )
    frames = [get_quote_rt(data, quote_flag, rt_flag) for quote_flag, rt_flag in category_flags]
    return tuple(frames)
    

1. Casing (Upper or lower case)
2. Noise Removal (Removal of punctuation, white spaces, special characters, HTML tags)
3. Tokenization (Tweets to tokens, i.e. words separated by spaces)
4. Stopword Removal
5. Text Normalization (Stemming and Lemmatization)

In [16]:
#Removing emojis
# Compiled once at definition time instead of on every demoji() call.
# NOTE(review): several ranges are much wider than emoji alone, e.g.
# U+24C2-U+1F251 and U+10000-U+10FFFF also cover CJK and every
# supplementary-plane character.
_EMOJI_PATTERN = re.compile("["
                "\U0001F600-\U0001F64F"  # emoticons
                "\U0001F300-\U0001F5FF"  # symbols & pictographs
                "\U0001F680-\U0001F6FF"  # transport & map symbols
                "\U0001F1E0-\U0001F1FF"  # flags (iOS)
                "\U00002702-\U000027B0"
                "\U000024C2-\U0001F251"
                "\U0001f926-\U0001f937"
                "\U00010000-\U0010ffff"
                "\u200d"
                "\u2640-\u2642"
                "\u2600-\u2B55"
                "\u23cf"
                "\u23e9"
                "\u231a"
                "\u3030"
                "\ufe0f"
                "]+", flags=re.UNICODE)

def demoji(text):
  """Return `text` with every run of characters from the emoji class removed.

  `text` may be UTF-8 encoded bytes (the original calling convention) or
  an already decoded str; passing a str used to fail with AttributeError
  because .decode() was called unconditionally. The result is always str.
  """
  if isinstance(text, (bytes, bytearray)):
    text = text.decode('utf-8')
  return _EMOJI_PATTERN.sub(r'', text)

#POSTags
def get_wordnet_pos(word):
  """Return the WordNet POS constant for `word`, as accepted by lemmatize().

  The word is tagged on its own with nltk.pos_tag and the first letter of
  the tag decides the result: J -> ADJ, N -> NOUN, V -> VERB, R -> ADV.
  Any other tag falls back to NOUN.
  """
  tagged = nltk.pos_tag([word])
  first_letter = tagged[0][1][0].upper()
  pos_by_letter = {
      "J": wordnet.ADJ,
      "N": wordnet.NOUN,
      "V": wordnet.VERB,
      "R": wordnet.ADV,
  }
  if first_letter in pos_by_letter:
    return pos_by_letter[first_letter]
  return wordnet.NOUN

# Module-level Porter stemmer used by apply_stemming.
ps = PorterStemmer()

def apply_stemming(row):
  """Return a list with the Porter stem of each item of row["FINAL_TEXT"]."""
  tokens = row["FINAL_TEXT"]
  stems = []
  for token in tokens:
    stems.append(ps.stem(token))
  return stems

#Remove URLs, user@, punctutions
def df_cleaning(data_):
  """Clean the 'FINAL_TEXT' column of `data_` in place and return `data_`.

  Steps, in order: remove URL-like substrings and @mentions, delete ASCII
  punctuation, normalise newlines / dashes / surrounding quotes, drop
  non-ASCII characters, tokenize with nltk.word_tokenize, lower-case,
  remove stopwords (module-level `stop_words`), Porter-stem each token and
  join the stems back into one space-separated string.

  Every step starts from .astype(str), so missing values are turned into
  their string form before processing.
  """
  punc = string.punctuation
  # Every alternative except the first begins with a literal space, so those
  # only match when preceded by a space (e.g. an @mention at the very start
  # of the text is not matched by this pattern).
  data_["FINAL_TEXT"] = data_["FINAL_TEXT"].astype(str).str.replace(r"http\S+| www\S+| https\S+| \S+\.com\S+| \S+\.com| \@[\w]+", "", regex=True)

  # ##################-------Punctutions-------##################
  # Delete every character listed in string.punctuation.
  data_["FINAL_TEXT"] = data_["FINAL_TEXT"].astype(str).str.translate(str.maketrans("", "", string.punctuation))
 
  # ##################-------More Cleaning-------##################
  # NOTE(review): the first pattern is written like a JavaScript regex literal.
  # As a Python regex it matches a literal "/", one character outside
  # [a-zA-Z0-9 ], then a literal "/g". "/" was already removed by the
  # punctuation step above, so this replace cannot match anything here.
  # The remaining calls turn newlines and em dashes into spaces and strip
  # curly quotes and spaces from both ends only.
  data_["FINAL_TEXT"] = data_["FINAL_TEXT"].astype(str).str.replace("/[^a-zA-Z0-9 ]/g", "", regex=True).str.replace("\n"," ", regex=True).str.replace("—"," ", regex=True).str.strip("“").str.strip("”").str.strip("’").str.lstrip(" ").str.rstrip(" ")
  
  # ##################-------Emojis-------##################
#   data_["FINAL_TEXT"] = data_["FINAL_TEXT"].astype(str).apply(lambda x:demoji(x.encode('utf8')))
  # Drops every non-ASCII character (emoji included) via an ASCII encode/decode round trip.
  data_["FINAL_TEXT"] = data_["FINAL_TEXT"].astype(str).apply(lambda x: x.encode('ascii', 'ignore').decode('ascii'))
  # ##################-------Tokenizing-------##################
  data_["FINAL_TEXT"] = data_["FINAL_TEXT"].astype(str)
  data_["FINAL_TEXT"] = data_["FINAL_TEXT"].apply(nltk.word_tokenize)
#   data_["FINAL_TEXT"] = data_.apply(lambda row: nltk.word_tokenize(row["FINAL_TEXT"]), axis=1)

  # ##################-------Lower characters---------##################
  data_["FINAL_TEXT"] = data_["FINAL_TEXT"].apply(lambda x: [word.lower() for word in x])

  # ##################-------Remove punctuations-------##################
  # `punc` is a str, so `word not in punc` is a substring test against the
  # punctuation string rather than a per-character membership test.
  data_["FINAL_TEXT"] = data_["FINAL_TEXT"].apply(lambda x: [word for word in x if word not in punc])

  # ##################-------Removing stopwords-------##################
#   data_["FINAL_TEXT"] = data_["FINAL_TEXT"].astype(str).apply(lambda x: [word for word in x if word not in stop_words])
  
  # data_["FINAL_TEXT"] = data_["FINAL_TEXT"].astype(str).apply(lambda x: [word for word in x if word not in stop_words])
  # Filters the token list and joins the survivors into one string.
  data_["FINAL_TEXT"] = data_["FINAL_TEXT"].apply(lambda words: ' '.join(word for word in words if word not in stop_words))
 
  
  # ##################-------Stemming-------##################
  # This local stemmer shadows the module-level `ps` inside this function.
  ps = PorterStemmer()
#   data_["FINAL_TEXT"] = data_["FINAL_TEXT"].apply(lambda x: [ps.stem(y) for y in x])
  # Re-split the joined string on single spaces and stem each piece.
  data_["FINAL_TEXT"] = data_["FINAL_TEXT"].apply(lambda x: [ps.stem(y) for y in x.split(' ')])
#   data_["FINAL_TEXT"] = data_.apply(apply_stemming, axis = 1)
#   data_["FINAL_TEXT"] = data_.apply(ps.stem)

#   print(data_["FINAL_TEXT"], "from here")


  # ##################-------Lemmatizing-------##################
  # Lemmatizing is currently disabled (commented out below).
#   lemmatizer = WordNetLemmatizer()
#   data["FINAL_TEXT"] = data["FINAL_TEXT"].astype(str).apply(lambda x: [lemmatizer.lemmatize(y, get_wordnet_pos(y)) for y in x])
  
  # ##################-------Joining the stemmed tokens to form string-------##################
  data_["FINAL_TEXT"] = data_["FINAL_TEXT"].apply(lambda x: ' '.join(x))
 
  # ##################-------Remove punctuations-------##################
#   data_["FINAL_TEXT"] = data_["FINAL_TEXT"].astype(str).str.translate(str.maketrans("", "", string.punctuation)).str.replace("’", " ").str.replace("“", " ").str.replace("”", " ")

#   final_df["FINAL_TEXT"] = data_["FINAL_TEXT"]
#   final_df = data_
  return data_


In [17]:
def split_dataframe(df, nums):
    """Split `df` row-wise into exactly `nums` consecutive chunks.

    Every row lands in exactly one chunk and chunk sizes differ by at most
    one. The earlier fixed-size slicing (len(df) // nums rows per chunk)
    silently dropped the len(df) % nums trailing rows, and dropped all rows
    whenever len(df) < nums. When len(df) is a multiple of `nums` the
    chunks are the same as before.

    Returns a list of `nums` DataFrames; some are empty when len(df) < nums.
    Raises ZeroDivisionError when `nums` is 0.
    """
    total_rows = len(df)
    # Boundaries i * total // nums partition [0, total) with no gap or overlap.
    bounds = [(i * total_rows) // nums for i in range(nums + 1)]
    return [df.iloc[lo:hi] for lo, hi in zip(bounds, bounds[1:])]

In [18]:
def clean_data_and_save(data_,filename, year):
    """Clean `data_` in parallel, write it to CSV and return the cleaned frame.

    The frame is split into one chunk per CPU core, each chunk goes through
    df_cleaning in a worker process, and the results are concatenated.
    The result is written to "<output dir>/<filename>.csv", where the
    output directory name ends in "<date_to_analyse>-<year>".

    NOTE(review): `date_to_analyse` is not a parameter; it is resolved from
    module scope at call time. The output root is an absolute,
    machine-specific path.
    """
    num_cores = mp.cpu_count()
    pool = Pool(num_cores)
    try:
        chunks = split_dataframe(data_, num_cores)
        cleaned = pd.concat(pool.map(df_cleaning, chunks))
    finally:
        # Shut the workers down even when splitting or cleaning raises;
        # previously a failure left the pool's processes running.
        pool.close()
        pool.join()

    directory = "/media/manikya_varshney/Backup/Yale Data/output_data/{}-{}".format(date_to_analyse, year)

    create_directory(directory)
    cleaned.to_csv("{}/{}.csv".format(directory, filename))
    return cleaned

In [19]:
def clean_all_data_and_save(data_, year):
    """Clean and save each of the four category frames.

    `data_` is the 4-tuple (non_quote_non_rt, quote_non_rt, quote_rt,
    non_quote_rt). Each frame is passed to clean_data_and_save under its
    category name; the cleaned frames are returned in the same tuple order.
    """
    non_quote_non_rt, quote_non_rt, quote_rt, non_quote_rt = data_
    # Processing order differs from the tuple order: non_quote_rt is cleaned
    # before quote_rt.
    work_items = (
        ('non_quote_non_rt', non_quote_non_rt),
        ('quote_non_rt', quote_non_rt),
        ('non_quote_rt', non_quote_rt),
        ('quote_rt', quote_rt),
    )
    cleaned = {}
    for label, frame in work_items:
        cleaned[label] = clean_data_and_save(frame, label, year)
    return (
        cleaned['non_quote_non_rt'],
        cleaned['quote_non_rt'],
        cleaned['quote_rt'],
        cleaned['non_quote_rt'],
    )

In [20]:
def combine_all_data(data_, year):
    """Concatenate the four category frames and write them to one CSV.

    `data_` must unpack into exactly four frames. The combined frame is
    written without its index to "combined_<date_to_analyse>.csv" inside
    the output directory for that date and year.

    NOTE(review): `date_to_analyse` is read from module scope, not passed in.
    """
    first, second, third, fourth = data_
    merged = pd.concat([first, second, third, fourth])

    output_dir = "/media/manikya_varshney/Backup/Yale Data/output_data/{}-{}".format(date_to_analyse, year)
    create_directory(output_dir)

    target = '{}/combined_{}.csv'.format(output_dir, date_to_analyse)
    merged.to_csv(target, index = False)

In [21]:
def final_clean(FILENAME, TEMP_FOLDER, FILES_NAMES, date_to_analyse, FOLDER_NAME, year):
    """Run the full pipeline for one day: split, clean and save, then combine.

    Calls separate_df, clean_all_data_and_save and combine_all_data in that
    order. Nothing is returned; the results are the CSV files those
    functions write.
    """
    separated = separate_df(FILENAME, TEMP_FOLDER, FILES_NAMES, date_to_analyse, FOLDER_NAME)
    cleaned = clean_all_data_and_save(separated, year)
    non_quote_non_rt, quote_non_rt, quote_rt, non_quote_rt = cleaned
    combine_all_data((non_quote_non_rt, quote_non_rt, quote_rt, non_quote_rt), year)

In [22]:
# Constants
#FILENAME = "h01-20200818-10files.zip"

# Path of the zip archive to clean (absolute and machine-specific; adjust
# before running on another machine).
FILENAME = "/media/manikya_varshney/Backup/Yale Data/h01-20200818-20200824.zip"

#TEMP_FOLDER = "tmp"

# Folder the archive is extracted into. The final cell deletes this folder.
TEMP_FOLDER = "/media/manikya_varshney/Backup/Yale Data/tmp"

# Archive base name up to its first ".", used as the sub-folder of TEMP_FOLDER
# in which the extracted CSVs are looked up.
FOLDER_NAME = get_folder_name_from_zip(FILENAME)

# Archive base name with the part after its last "-" removed.
FILES_NAMES = create_filename(FILENAME, TEMP_FOLDER)
# These two module-level names are also read as globals inside
# save_other_date (year) and clean_data_and_save / combine_all_data
# (date_to_analyse).
date_to_analyse, year = strip_date(FILES_NAMES)

In [None]:
# Date window parsed from the archive name.
# NOTE(review): this literal repeats the base name of FILENAME from the
# constants cell; the two have to be changed together.
start_date, end_date = get_date_range('h01-20200818-20200824.zip')

start = time.time()

# daterange() yields start_date up to but NOT including end_date, so the last
# date in the archive name is not processed here -- TODO confirm intended.
for single_date in daterange(start_date, end_date):
    file_name_start = 'h01-{}'.format(single_date.strftime("%Y%m%d"))
    # Rebinds the module-level date_to_analyse / year, which several pipeline
    # functions read as globals.
    date_to_analyse, year = strip_date(file_name_start)
    final_clean(FILENAME, TEMP_FOLDER, file_name_start, date_to_analyse, FOLDER_NAME, year)
    
    #print(date_to_analyse, year)
    #print(single_date.strftime("%Y-%m-%d"))
    
end = time.time()
# Elapsed wall-clock seconds for the whole loop.
print(end-start)

reading file /media/manikya_varshney/Backup/Yale Data/tmp/h01-20200818-20200824/h01-20200818-153021.csv
reading file /media/manikya_varshney/Backup/Yale Data/tmp/h01-20200818-20200824/h01-20200818-153426.csv
reading file /media/manikya_varshney/Backup/Yale Data/tmp/h01-20200818-20200824/h01-20200818-153830.csv
reading file /media/manikya_varshney/Backup/Yale Data/tmp/h01-20200818-20200824/h01-20200818-154234.csv
reading file /media/manikya_varshney/Backup/Yale Data/tmp/h01-20200818-20200824/h01-20200818-154636.csv
reading file /media/manikya_varshney/Backup/Yale Data/tmp/h01-20200818-20200824/h01-20200818-155117.csv
reading file /media/manikya_varshney/Backup/Yale Data/tmp/h01-20200818-20200824/h01-20200818-155641.csv
reading file /media/manikya_varshney/Backup/Yale Data/tmp/h01-20200818-20200824/h01-20200818-160151.csv
reading file /media/manikya_varshney/Backup/Yale Data/tmp/h01-20200818-20200824/h01-20200818-160818.csv
reading file /media/manikya_varshney/Backup/Yale Data/tmp/h01-20

20200819
reading file /media/manikya_varshney/Backup/Yale Data/tmp/h01-20200818-20200824/h01-20200818-214751.csv
20200819
reading file /media/manikya_varshney/Backup/Yale Data/tmp/h01-20200818-20200824/h01-20200818-215140.csv
20200819
reading file /media/manikya_varshney/Backup/Yale Data/tmp/h01-20200818-20200824/h01-20200818-215524.csv
20200819
reading file /media/manikya_varshney/Backup/Yale Data/tmp/h01-20200818-20200824/h01-20200818-220300.csv
20200819
reading file /media/manikya_varshney/Backup/Yale Data/tmp/h01-20200818-20200824/h01-20200818-220651.csv
20200819
reading file /media/manikya_varshney/Backup/Yale Data/tmp/h01-20200818-20200824/h01-20200818-221045.csv
20200819
reading file /media/manikya_varshney/Backup/Yale Data/tmp/h01-20200818-20200824/h01-20200818-221442.csv
20200819
reading file /media/manikya_varshney/Backup/Yale Data/tmp/h01-20200818-20200824/h01-20200818-221858.csv
20200819
reading file /media/manikya_varshney/Backup/Yale Data/tmp/h01-20200818-20200824/h01-202

reading file /media/manikya_varshney/Backup/Yale Data/tmp/h01-20200818-20200824/h01-20200819-062532.csv
reading file /media/manikya_varshney/Backup/Yale Data/tmp/h01-20200818-20200824/h01-20200819-062953.csv
reading file /media/manikya_varshney/Backup/Yale Data/tmp/h01-20200818-20200824/h01-20200819-063411.csv
reading file /media/manikya_varshney/Backup/Yale Data/tmp/h01-20200818-20200824/h01-20200819-063833.csv
reading file /media/manikya_varshney/Backup/Yale Data/tmp/h01-20200818-20200824/h01-20200819-064252.csv
reading file /media/manikya_varshney/Backup/Yale Data/tmp/h01-20200818-20200824/h01-20200819-064703.csv
reading file /media/manikya_varshney/Backup/Yale Data/tmp/h01-20200818-20200824/h01-20200819-065115.csv
reading file /media/manikya_varshney/Backup/Yale Data/tmp/h01-20200818-20200824/h01-20200819-065524.csv
reading file /media/manikya_varshney/Backup/Yale Data/tmp/h01-20200818-20200824/h01-20200819-070329.csv
reading file /media/manikya_varshney/Backup/Yale Data/tmp/h01-20

reading file /media/manikya_varshney/Backup/Yale Data/tmp/h01-20200818-20200824/h01-20200819-115113.csv
reading file /media/manikya_varshney/Backup/Yale Data/tmp/h01-20200818-20200824/h01-20200819-115517.csv
reading file /media/manikya_varshney/Backup/Yale Data/tmp/h01-20200818-20200824/h01-20200819-115916.csv
reading file /media/manikya_varshney/Backup/Yale Data/tmp/h01-20200818-20200824/h01-20200819-120306.csv
reading file /media/manikya_varshney/Backup/Yale Data/tmp/h01-20200818-20200824/h01-20200819-121521.csv
reading file /media/manikya_varshney/Backup/Yale Data/tmp/h01-20200818-20200824/h01-20200819-122808.csv
reading file /media/manikya_varshney/Backup/Yale Data/tmp/h01-20200818-20200824/h01-20200819-124107.csv
reading file /media/manikya_varshney/Backup/Yale Data/tmp/h01-20200818-20200824/h01-20200819-125433.csv
reading file /media/manikya_varshney/Backup/Yale Data/tmp/h01-20200818-20200824/h01-20200819-130730.csv
reading file /media/manikya_varshney/Backup/Yale Data/tmp/h01-20

20200820
reading file /media/manikya_varshney/Backup/Yale Data/tmp/h01-20200818-20200824/h01-20200819-205801.csv
20200820
reading file /media/manikya_varshney/Backup/Yale Data/tmp/h01-20200818-20200824/h01-20200819-210603.csv
20200820
reading file /media/manikya_varshney/Backup/Yale Data/tmp/h01-20200818-20200824/h01-20200819-211412.csv
20200820
reading file /media/manikya_varshney/Backup/Yale Data/tmp/h01-20200818-20200824/h01-20200819-212212.csv
20200820
reading file /media/manikya_varshney/Backup/Yale Data/tmp/h01-20200818-20200824/h01-20200819-213013.csv
20200820
reading file /media/manikya_varshney/Backup/Yale Data/tmp/h01-20200818-20200824/h01-20200819-213806.csv
20200820
reading file /media/manikya_varshney/Backup/Yale Data/tmp/h01-20200818-20200824/h01-20200819-214600.csv
20200820
reading file /media/manikya_varshney/Backup/Yale Data/tmp/h01-20200818-20200824/h01-20200819-215353.csv
20200820
reading file /media/manikya_varshney/Backup/Yale Data/tmp/h01-20200818-20200824/h01-202

In [None]:
# Recursively delete the whole temporary folder, including the extracted CSVs
# and the carried-over "h01-<date>.csv" files written by save_other_date.
shutil.rmtree(TEMP_FOLDER)

In [None]:
'''
################ BEFORE USE ################
1. In Block [22] :- Change FILENAME and TEMP_FOLDER accordingly
2. In Block [9] :- Change directory accordingly
3. In Block [18] :- Change directory accordingly
4. In Block [20] :- Change directory accordingly

################ USAGE ################
1. In Block [22] :- Pass the location of zip file to be cleaned in FILENAME
2. In Block [23] :- Pass the name of zip file to be cleaned in get_date_range function
'''