In [None]:
from pandas import DataFrame
from datetime import datetime

class MVTSSample:
    """Container bundling a flare-type label, a start/end time and a DataFrame.

    The four values are stored as given and exposed through read-only style
    getter methods; no validation or copying is performed.
    """

    def __init__(
        self,
        flare_type: str,
        start_time: datetime,
        end_time: datetime,
        data: DataFrame,
    ):
        """Keep references to the four constructor arguments on the instance."""
        self._flare_type, self._start_time = flare_type, start_time
        self._end_time, self._data = end_time, data

    def get_flare_type(self):
        """Return the flare-type label passed to the constructor."""
        return self._flare_type

    def get_start_time(self):
        """Return the start time passed to the constructor."""
        return self._start_time

    def get_end_time(self):
        """Return the end time passed to the constructor."""
        return self._end_time

    def get_data(self):
        """Return the very same DataFrame object passed to the constructor."""
        return self._data

In [None]:
import pandas as pd
import re 

def read_non_flare_mvts(data_dir:str, file_name:str) -> MVTSSample:
    """Load one tab-separated sample file and wrap it in an MVTSSample.

    The file name, minus its last four characters (the extension), must look
    like ``<label>_s<start>_e<end>`` where both timestamps are written as
    ``YYYY-MM-DDTHH_MM_SS``.

    Args:
        data_dir: Directory that contains the file.
        file_name: Name of the file inside ``data_dir``.

    Returns:
        An MVTSSample holding the flare type, the parsed start and end times
        and the file content as a DataFrame. The flare type is the first two
        characters of the label when it starts with ``'F'``, otherwise only
        the first character.

    Raises:
        ValueError: If the file name does not follow the expected pattern or
            a timestamp cannot be parsed.
    """
    # Function-scope import: the notebook only imports `os` in a later cell.
    import os

    timestamp_format = '%Y-%m-%d %H:%M:%S'

    # file_name[:-4] strips a four-character extension such as ".csv".
    found = re.search(r'(.*)_s(.*)_e(.*)', file_name[:-4])
    if found is None:
        raise ValueError(
            "file name does not match '<label>_s<start>_e<end>': %r" % (file_name,)
        )
    label, s_t, e_t = found.groups()

    start_time = datetime.strptime(s_t.replace('T',' ').replace('_',':'), timestamp_format)
    end_time = datetime.strptime(e_t.replace('T',' ').replace('_',':'), timestamp_format)

    # Labels starting with 'F' keep two characters; every other label keeps one.
    flare_type = label[:2] if label.startswith('F') else label[:1]

    # os.path.join instead of a hard-coded "\\" so the path works on any OS.
    path = os.path.join(data_dir, file_name)
    data_frame = pd.read_csv(path,delimiter='\t')

    return MVTSSample(flare_type,start_time,end_time,data_frame)

In [None]:
def read_flare_mvts(data_dir:str, file_name:str) -> MVTSSample:
    """Load one tab-separated sample file and wrap it in an MVTSSample.

    The file name, minus its last four characters (the extension), must look
    like ``<label>_s<start>_e<end>`` where both timestamps are written as
    ``YYYY-MM-DDTHH_MM_SS``.

    Args:
        data_dir: Directory that contains the file.
        file_name: Name of the file inside ``data_dir``.

    Returns:
        An MVTSSample holding the flare type, the parsed start and end times
        and the file content as a DataFrame. The flare type is the first two
        characters of the label when it starts with ``'F'``, otherwise only
        the first character.

    Raises:
        ValueError: If the file name does not follow the expected pattern or
            a timestamp cannot be parsed.
    """
    # Function-scope import: the notebook only imports `os` in a later cell.
    import os

    timestamp_format = '%Y-%m-%d %H:%M:%S'

    # file_name[:-4] strips a four-character extension such as ".csv".
    found = re.search(r'(.*)_s(.*)_e(.*)', file_name[:-4])
    if found is None:
        raise ValueError(
            "file name does not match '<label>_s<start>_e<end>': %r" % (file_name,)
        )
    label, s_t, e_t = found.groups()

    start_time = datetime.strptime(s_t.replace('T',' ').replace('_',':'), timestamp_format)
    end_time = datetime.strptime(e_t.replace('T',' ').replace('_',':'), timestamp_format)

    # Labels starting with 'F' keep two characters; every other label keeps one.
    flare_type = label[:2] if label.startswith('F') else label[:1]

    # os.path.join instead of a hard-coded "\\" so the path works on any OS.
    path = os.path.join(data_dir, file_name)
    data_frame = pd.read_csv(path,delimiter='\t')

    return MVTSSample(flare_type,start_time,end_time,data_frame)

In [None]:
# Read a single sample file to try out read_non_flare_mvts.
# NOTE(review): data_dir is an absolute, machine-specific path; it has to be
# edited before this cell can run anywhere else.
data_dir = r'C:\Users\Krishna Rukmini\Downloads\partition1_instances\partition1_instances (1).tar\partition1\NF'
file_name = "B1.0@13_Primary_ar10_s2010-05-02T23_12_00_e2010-05-03T11_00_00.csv"
results = read_non_flare_mvts(data_dir, file_name)
# Last expression of the cell: displays the sample's DataFrame.
results.get_data()

In [None]:
# Keep the sample's DataFrame under a short name for the exploration cell below.
df = results.get_data()

In [None]:
# Print the name of every column of df that holds more than 10 distinct values.
for val in df.columns:
    if df[val].nunique() <= 10:
        continue
    print(val)

In [None]:
import numpy as np

def calculate_descriptive_features(data:DataFrame)-> "tuple[DataFrame, list]":
    """Summarise a fixed set of columns of ``data`` into one row of statistics.

    For each of the 22 hard-coded column names, six numpy reductions are
    applied in this order: median, mean, std, min, max, var. Each value is
    stored in a column named ``<column><suffix>``.

    Args:
        data: DataFrame that contains every column listed in
            ``variates_to_calc_on``; a missing column raises ``KeyError``.

    Returns:
        A ``(frame, names)`` tuple. ``frame`` is a one-row DataFrame with
        132 columns. ``names`` is a one-element list whose only item is the
        list of those column names (callers index it with ``[0]``).
    """
    variates_to_calc_on = [ 'R_VALUE','TOTUSJH','TOTBSQ','TOTPOT','TOTUSJZ','ABSNJZH','SAVNCPP',
                           'USFLUX','TOTFZ','MEANPOT','EPSZ','MEANSHR','SHRGT45','MEANGAM','MEANGBT',
                           'MEANGBZ','MEANGBH','MEANJZH','TOTFY','MEANJZD','MEANALP','TOTFX']

    # (column suffix, reducer) pairs in output order.
    # NOTE(review): "_vARIANCE" looks like a typo for "_VARIANCE"; it is kept
    # byte-for-byte because it is part of the produced column names.
    statistics = (
        ("_MEDIAN", np.median),
        ("_MEAN", np.mean),
        ("_STDDEV", np.std),
        ("_MIN", np.min),
        ("_MAX", np.max),
        ("_vARIANCE", np.var),
    )

    feature_names = []
    feature_values = []
    for variate in variates_to_calc_on:
        column = data[variate]
        for suffix, reducer in statistics:
            feature_names.append(variate + suffix)
            feature_values.append(np.float64(reducer(column)))

    result_df = pd.DataFrame([feature_values], columns=feature_names)

    # The names are wrapped in an outer list to keep the historical return shape.
    return result_df, [feature_names]

In [None]:
# Compute the descriptive features of the single sample loaded above.
# The function returns a (one-row DataFrame, [column names]) pair.
# NOTE(review): `data` is rebound to the CSV read back from disk in a later cell.
data,features = calculate_descriptive_features(results.get_data())
data

In [None]:
# Unwrap the one-element outer list to get the flat list of column names.
# process_partition reads this module-level name, so this cell must run first.
features_to_return = features[0]
features_to_return

In [None]:
def process_partition(partition_location:str, abt_name:str):
    """Build one table of descriptive features for every sample of a partition.

    Every sub-directory reported by ``List_Of_Files`` (all entries after the
    first, which is the partition root itself) is scanned. Each entry of a
    sub-directory is read as a sample file, summarised with
    ``calculate_descriptive_features`` and labelled with its flare type.

    Args:
        partition_location: Root directory of the partition.
        abt_name: Path of the CSV file the resulting table is written to.

    Returns:
        A DataFrame with one row per sample file: the feature columns followed
        by ``FLARE_TYPE``. When no sample is found, an empty frame whose
        columns are the module-level ``features_to_return`` is returned.
    """
    listOfFiles = List_Of_Files(partition_location)
    print(listOfFiles)
    sub_directories = listOfFiles[1:]

    per_sample_frames = []
    for sub_directory in sub_directories:
        # Choose the reader from the folder name instead of relying on the
        # order in which os.walk happens to report the sub-directories.
        folder_name = os.path.basename(os.path.normpath(sub_directory))
        reader = read_flare_mvts if folder_name == 'FL' else read_non_flare_mvts

        for ele in os.listdir(sub_directory):
            data_result = reader(sub_directory, ele)
            descriptive_features, _ = calculate_descriptive_features(data_result.get_data())
            descriptive_features.insert(0,"FLARE_TYPE",data_result.get_flare_type())
            per_sample_frames.append(descriptive_features)

    if per_sample_frames:
        # DataFrame.append was removed in pandas 2.0; concatenate once instead
        # of growing the frame row by row.
        descriptive_frame = pd.concat(per_sample_frames)
        # Keep the historical column order: feature columns, then FLARE_TYPE.
        ordered_columns = [c for c in descriptive_frame.columns if c != "FLARE_TYPE"]
        ordered_columns.append("FLARE_TYPE")
        descriptive_frame = descriptive_frame[ordered_columns]
    else:
        descriptive_frame = pd.DataFrame([],columns = features_to_return )

    # Write the table once, and only when at least one sub-directory was scanned
    # (the original wrote it after each sub-directory).
    if sub_directories:
        descriptive_frame.to_csv(abt_name,index = False,header = True)
    return descriptive_frame

In [None]:
import os

def List_Of_Files(dir_Name):
    """Return the path of every directory visited by ``os.walk(dir_Name)``.

    The root ``dir_Name`` comes first, followed by its nested sub-directories
    in the order ``os.walk`` yields them. Despite the name, no file paths are
    returned.
    """
    visited_directories = []
    for current_directory, _sub_directories, _file_names in os.walk(dir_Name):
        visited_directories.append(current_directory)
    return visited_directories


In [None]:
# Extract the features of the whole partition and write them to all_names.csv.
# NOTE(review): both paths are absolute and machine-specific.
result = process_partition(r"C:\Users\Krishna Rukmini\Downloads\partition1_instances\partition1_instances (1).tar\partition1",r"C:\Users\Krishna Rukmini\Downloads\partition1\all_names.csv")
print(result)   

In [None]:
import pandas as pd

# Reload the feature table written by process_partition; this rebinds `data`.
data = pd.read_csv(r"C:\Users\Krishna Rukmini\Downloads\partition1\all_names.csv")

In [None]:
def calc_summary_for(feature_name:str, data):
    """Build a one-row quality summary for a single column of ``data``.

    Args:
        feature_name: Name of the column to summarise.
        data: DataFrame containing that column. It is not modified.

    Returns:
        A one-row DataFrame with the columns listed in
        ``summary_feature_names``. Quartiles use linear interpolation and,
        like the other statistics, ignore missing values. An outlier is a
        value more than 1.5 * IQR below the 25th or above the 75th percentile.
    """
    summary_feature_names = ['Feature Name', 'Cardinality', 'Non-null Count', 'Null Count', 'Min', '25th', 'Mean', 
                             '50th', '75th', 'Max', 'std. Dev','Outlier Count Low', 'Outlier Count High']

    column = data[feature_name]

    q1 = column.quantile(0.25)
    q2 = column.quantile(0.50)
    q3 = column.quantile(0.75)
    iqr = q3 - q1
    low_fence = q1 - (1.5 * iqr)
    high_fence = q3 + (1.5 * iqr)

    # Keyed by column name so every value lands under its own header; the
    # original appended values in an order that did not match the header list.
    summary = {
        'Feature Name': feature_name,
        'Cardinality': column.nunique(),
        'Non-null Count': column.count(),
        'Null Count': column.isnull().sum(),
        'Min': column.min(),
        '25th': q1,
        'Mean': column.mean(),
        '50th': q2,
        '75th': q3,
        'Max': column.max(),
        'std. Dev': column.std(),
        # Counted over the whole column; comparisons with NaN are False, so
        # missing values are never counted as outliers.
        'Outlier Count Low': int((column < low_fence).sum()),
        'Outlier Count High': int((column > high_fence).sum()),
    }

    Result_df = pd.DataFrame([summary],columns=summary_feature_names)

    return Result_df
    

In [None]:
def construct_quality_report(data):
    """Stack the per-column summaries of ``data`` into one quality report.

    Args:
        data: DataFrame of features. The ``FLARE_TYPE`` column, when present,
            is left out of the report. ``data`` itself is not modified.

    Returns:
        A DataFrame with one row per remaining column (as produced by
        ``calc_summary_for``) and a fresh 0..n-1 index. When no column is
        left, an empty frame with the summary column names is returned.
    """
    excluded_columns = ['FLARE_TYPE']

    summary_feature_names = ['Feature Name', 'Cardinality', 'Non-null Count', 'Null Count', 'Min', '25th', 'Mean', 
                             '50th', '75th', 'Max','std. Dev', 'Outlier Count Low', 'Outlier Count High']

    # drop() returns a new frame, so the caller's data stays untouched;
    # errors='ignore' lets a table without FLARE_TYPE be summarised as well.
    summary_table_df = data.drop(columns=excluded_columns, errors='ignore')

    # Collect every one-row summary first and concatenate once, rather than
    # re-concatenating a growing frame for each column.
    per_feature_summaries = [
        calc_summary_for(column_name, summary_table_df)
        for column_name in list(summary_table_df.columns)
    ]

    if not per_feature_summaries:
        return pd.DataFrame(columns=summary_feature_names)

    return pd.concat(per_feature_summaries, ignore_index=True)
    

In [None]:
import numpy as np

# Build the quality report for the feature table loaded from all_names.csv.
summary_table_full_partition = construct_quality_report(data)
def drop_low_card_data(summary_table, data) -> None:
    """Remove features whose reported cardinality is below 10.

    Both arguments are modified in place: the matching rows are dropped from
    ``summary_table`` and the columns named in their ``"Feature Name"`` cells
    are dropped from ``data``. Nothing is returned.
    """
    has_low_cardinality = summary_table['Cardinality'] < 10
    doomed_rows = summary_table[has_low_cardinality].index

    # Look up the feature name stored in each row that is about to go away.
    doomed_features = [summary_table["Feature Name"][row] for row in doomed_rows]

    summary_table.drop(doomed_rows, inplace=True)
    data.drop(doomed_features, axis=1, inplace=True)

# Mutates both the summary table and `data` in place; re-running this cell
# operates on the already reduced objects.
drop_low_card_data(summary_table_full_partition, data)

In [None]:
def drop_excessive_nan_data(summary_table, data) -> None:
    """Remove features in which more than 1% of the values are missing.

    Both arguments are modified in place: every column of ``data`` whose null
    count exceeds ``len(data) / 100`` is dropped, and the rows of
    ``summary_table`` whose ``"Feature Name"`` names such a column are dropped
    too. A dropped column that has no row in ``summary_table`` is simply
    skipped there. Nothing is returned.
    """
    max_null_count = len(data) / 100
    col_null_del = [
        column_name
        for column_name in data.columns
        if data[column_name].isnull().sum() > max_null_count
    ]

    data.drop(col_null_del,axis = 1,inplace = True)

    # isin() tolerates names missing from the summary table; the original
    # indexed the first match per name and raised IndexError when none existed.
    is_stale = summary_table["Feature Name"].isin(col_null_del)
    summary_table.drop(summary_table[is_stale].index, inplace=True)

# Mutates both the summary table and `data` in place.
drop_excessive_nan_data(summary_table_full_partition, data)

In [None]:
# Save the cleaned feature table and its summary table under out_dir.
import os

# NOTE(review): absolute, machine-specific output directory.
out_dir = "C:/Users/Krishna Rukmini/Downloads"
out_summary_table_name = 'data_summary_table.csv'
out_data = 'cleaned_partition1ExtractedFeatures.csv'

# out_dir used to be defined but never used, so both files were written to the
# current working directory; join it with the file names.
summary_table_full_partition.to_csv(os.path.join(out_dir, out_summary_table_name),header=True,index=False)
data.to_csv(os.path.join(out_dir, out_data),header=True,index=False)
