#### Mahmoud Draidi محمود دريدي

In [1]:
# Import pandas library
import pandas as pd

In [2]:
# Read the two pickled input frames: the per-file case records and the labels.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
CASES_PICKLE = 'data/df_cases_200906.gzip'
LABELS_PICKLE = 'data/df_label_200906.gzip'
df_cases = pd.read_pickle(CASES_PICKLE)
df_labels = pd.read_pickle(LABELS_PICKLE)

In [3]:
# Keep only the contract rows that are executed and have a quality score of at least 0.81
executed_mask = df_cases['IsExecuted'].eq(True)
quality_mask = df_cases['QualityScore'].ge(0.81)
valid_contracts = df_cases[executed_mask & quality_mask]

In [4]:
# Attach the label columns to every valid contract row.
# The left join keeps each valid contract even if its CaseId has no entry in df_labels.
final_df = valid_contracts.merge(df_labels, how='left', on='CaseId')

In [5]:
# Collapse the valid contracts to one row per CaseId:
#   - FileName -> list of the case's file names (renamed to ValidFileNames)
#   - OcrText  -> all texts of the case joined with a single space
#   - label_1 / label_2 -> first value found in the group
# NOTE: this cell overwrites final_df, so re-running it on its own fails
# (the FileName column no longer exists); re-run from the merge cell instead.
per_case_aggregations = {
    'FileName': list,
    'OcrText': ' '.join,
    'label_1': 'first',
    'label_2': 'first',
}
final_df = (
    final_df
    .groupby('CaseId')
    .agg(per_case_aggregations)
    .reset_index()
    .rename(columns={'FileName': 'ValidFileNames'})
)

In [6]:
# Collect, per CaseId, the file names of every contract that is NOT valid.
# The mask is the exact complement of the valid-contract filter
# (IsExecuted == True and QualityScore >= 0.81). Testing `== False` / `< 0.81`
# directly would skip rows whose IsExecuted or QualityScore is missing, because
# comparisons against NaN evaluate to False; those rows would then appear in
# neither the valid nor the invalid list.
is_valid_contract = (
    (df_cases['IsExecuted'] == True) & (df_cases['QualityScore'] >= 0.81)
).fillna(False)  # treat an undetermined (NA) result as "not valid"
invalid_contracts = df_cases[~is_valid_contract]
invalid_filenames = (
    invalid_contracts
    .groupby('CaseId')['FileName']
    .apply(list)
    .reset_index()
    .rename(columns={'FileName': 'InvalidFileNames'})
)

In [7]:
# Add the InvalidFileNames column to the per-case frame.
# Left join: cases without any invalid contract stay in the result with NaN in that column.
final_df = final_df.merge(invalid_filenames, how='left', on='CaseId')

In [8]:
# Cases with no invalid contract got NaN from the left merge; replace it with
# the string 'None'.
# The result is assigned back rather than calling fillna(inplace=True) on the
# selected column: that chained in-place form operates on an intermediate
# object, raises a FutureWarning in recent pandas, and leaves final_df
# unchanged once Copy-on-Write is enabled.
final_df['InvalidFileNames'] = final_df['InvalidFileNames'].fillna('None')

In [9]:
# Put the columns into the order used for the delivered dataset
output_columns = [
    'CaseId',
    'ValidFileNames',
    'InvalidFileNames',
    'OcrText',
    'label_1',
    'label_2',
]
final_df = final_df[output_columns]

In [10]:
# Persist the prepared dataset as a pickle file.
# NOTE(review): with the default compression='infer', pandas keys off suffixes
# such as '.gz'; '.gzip' is presumably written uncompressed — confirm if that matters.
FINAL_PICKLE = 'submit/df_final.gzip'
final_df.to_pickle(FINAL_PICKLE)

In [11]:
# Display the final DataFrame (one row per CaseId) as the cell output
final_df

Unnamed: 0,CaseId,ValidFileNames,InvalidFileNames,OcrText,label_1,label_2
0,003061189006,[003061189006_69176036_Order form_978-1-62414-...,,As difficult behavior her myself help.,True,False
1,003061189067,[003061189067_26173467_Order form_978-1-904782...,,Present can phone form.,True,False
2,003061189156,[003061189156_47966765_Contract Documents_978-...,,Section science difference success wish it wide.,False,False
3,003061189229,[003061189229_21094545_other documents_978-1-8...,[003061189229_69115288_Master contract_978-0-1...,Accept sell leader herself if.,False,False
4,003061189349,[003061189349_72196625_Other documents_978-0-6...,,Another later everybody large real.,False,False
...,...,...,...,...,...,...
565,003061230596,[003061230596_91328991_other documents_978-0-8...,[003061230596_30702523_other documents_978-0-6...,Miss style unit. High begin purpose interestin...,False,False
566,003061230613,[003061230613_55291460_other documents_978-1-1...,[003061230613_47613577_Terms & Conditions (all...,Really certainly might responsibility responsi...,False,False
567,003061230657,[003061230657_50860326_Order form_978-1-80159-...,,Cultural light carry past technology finish la...,True,False
568,003061230728,[003061230728_74076581_Amendments_978-0-14-763...,[003061230728_79408066_Master contract_978-0-1...,None attorney spend tend miss appear.,True,False


In [12]:
# Load the saved dataset back from disk to check that the written file is readable
RESULT_PICKLE = 'submit/df_final.gzip'
Result = pd.read_pickle(RESULT_PICKLE)

In [None]:
# Display the reloaded result
# Notes:
# - InvalidFileNames holds the string 'None' when a CaseId has no invalid contracts;
#   otherwise it lists the file names of that case's invalid contracts.
# - OcrText contains the OcrText values of all VALID contracts of the case, concatenated.
# - ValidFileNames contains the file names of all VALID contracts of the case.
# Thank you :)
Result

Unnamed: 0,CaseId,ValidFileNames,InvalidFileNames,OcrText,label_1,label_2
0,003061189006,[003061189006_69176036_Order form_978-1-62414-...,,As difficult behavior her myself help.,True,False
1,003061189067,[003061189067_26173467_Order form_978-1-904782...,,Present can phone form.,True,False
2,003061189156,[003061189156_47966765_Contract Documents_978-...,,Section science difference success wish it wide.,False,False
3,003061189229,[003061189229_21094545_other documents_978-1-8...,[003061189229_69115288_Master contract_978-0-1...,Accept sell leader herself if.,False,False
4,003061189349,[003061189349_72196625_Other documents_978-0-6...,,Another later everybody large real.,False,False
...,...,...,...,...,...,...
565,003061230596,[003061230596_91328991_other documents_978-0-8...,[003061230596_30702523_other documents_978-0-6...,Miss style unit. High begin purpose interestin...,False,False
566,003061230613,[003061230613_55291460_other documents_978-1-1...,[003061230613_47613577_Terms & Conditions (all...,Really certainly might responsibility responsi...,False,False
567,003061230657,[003061230657_50860326_Order form_978-1-80159-...,,Cultural light carry past technology finish la...,True,False
568,003061230728,[003061230728_74076581_Amendments_978-0-14-763...,[003061230728_79408066_Master contract_978-0-1...,None attorney spend tend miss appear.,True,False
