⬇ Import Libraries:

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import re
import doctest

🔄 Related Functions:

In [2]:
def Extract_Key_From_Filename(File_Name):
    """
    The Purpose of this function is:
    Extracting a Proper key from file name 
    for DataFrame dictionary
    Input: File_Name:String
    Output: Key_Name:String
    >>> Extract_Key_From_Filename("02b-breast-cancer-wisconsin.data")
    'breast-cancer-wisconsin'
    """
    start=File_Name.index("-")
    stop=File_Name.index(".")
    Key_Name=File_Name[start+1:stop]
    return Key_Name



def Extract_Column_Name_From_File(File_Name_Address):
    """
    The purpose of this function is:
    Extracting columns names from .name file
    for desired DataFrame
    input:File Address:String
    output:Columns name:list
    """
    
    
    def Cleaning_Column_lines(Column_line):
            """
            We use this function for finding column names 
            in different format
            this will be used with map 
            Input:Column line : String
            Output:Column name: String
            >>> Cleaning_Column_lines("1. Sample code number            id number")
            'Sample code number'
            >>> Cleaning_Column_lines("1. sepal length in cm")
            'sepal length in cm'
            >>> Cleaning_Column_lines("1.  Sequence Name: Accession number for the SWISS-PROT database")
            'Sequence Name'
            """
            start=Column_line.index(".")
            list_char_to_check=[":","  "]
            stop=0
            for char in list_char_to_check:
                try:
                    position=Column_line.index(char)

                except:
                    pass

                else:
                    stop=position
                    break

            if stop==0:
                stop=len(Column_line)
            return(str.strip(Column_line[start+1:stop]))
                
            
        
    File=open(File_Name_Address,mode='r')
    text=File.read()
    text=text.lower()
    Columns_section=text[text.index("7. attribute information"):text.index("8. missing attribute values")]
    pattern=r"\d[.]\s+.*"
    Columns_list=re.findall(pattern,Columns_section)
    
    Columns_list=list(map(Cleaning_Column_lines,Columns_list))
    return(Columns_list[1:])

In [3]:
Extract_Column_Name_From_File('Raw Data\\02a-breast-cancer-wisconsin.names')

['sample code number',
 'clump thickness',
 'uniformity of cell size',
 'uniformity of cell shape',
 'marginal adhesion',
 'single epithelial cell size',
 'bare nuclei',
 'bland chromatin',
 'normal nucleoli',
 'mitoses',
 'class']

🎯 Functions Test:

In [4]:
doctest.testmod(verbose=True)

Trying:
    Extract_Key_From_Filename("02b-breast-cancer-wisconsin.data")
Expecting:
    'breast-cancer-wisconsin'
ok
2 items had no tests:
    __main__
    __main__.Extract_Column_Name_From_File
1 items passed all tests:
   1 tests in __main__.Extract_Key_From_Filename
1 tests in 3 items.
1 passed and 0 failed.
Test passed.


TestResults(failed=0, attempted=1)

⬇ Reading Data: 

In [5]:
wd=os.getcwd()+"\\Data"
Files_Addresses=os.listdir(wd)
DataFrames_Dictionary={}
for File in list(filter(lambda X:X.endswith(".data"),Files_Addresses)):
    key=Extract_Key_From_Filename(File)
    names_file=list(filter(lambda X:X.endswith(key+".names"),Files_Addresses))
    print(key,names_file)
    Column_list=Extract_Column_Name_From_File("Data\\"+names_file[0])
    DataFrames_Dictionary[key]=pd.read_csv("Data\\"+File,names=Column_list,header=None)

breast-cancer-wisconsin ['02a-breast-cancer-wisconsin.names']
ecoli ['03b-ecoli.names']
glass ['04b-glass.names']
haberman ['05b-haberman.names']
iris ['06b-iris.names']


👩‍🔬 DataFrames Analyse

In [6]:
DataFrame_index=1
DataFrames_Names=list(DataFrames_Dictionary.keys())
df=DataFrames_Dictionary.get(DataFrames_Names[DataFrame_index])
df.head()

Unnamed: 0,sequence name,mcg,gvh,lip,chg,aac,alm1,alm2
0,AAT_ECOLI0.490.290.480.50.560.240.35cp,,,,,,,
1,ACEA_ECOLI0.070.40.480.50.540.350.44cp,,,,,,,
2,ACEK_ECOLI0.560.40.480.50.490.370.46cp,,,,,,,
3,ACKA_ECOLI0.590.490.480.50.520.450.36cp,,,,,,,
4,ADI_ECOLI0.230.320.480.50.550.250.35cp,,,,,,,


In [26]:
file=open('Data\\03a-ecoli.data',mode='r+')
text=file.read()

reaesc = re.compile(r'\x1b')
new_text = reaesc.sub(',', text)
file.write(new_text)
print(new_text)
file.close()


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [22]:
file=open('Data\\Test.data',mode='r')
text=file.read()
text

''