1. Data cleaning

Final data-cleaning code: cleans CSV, XLSX, XML, and JSON files


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler
import xml.etree.ElementTree as ET

def clean_data(file_path):
    """Load a tabular file, tidy it and save the result as a CSV file.

    The file is read according to its extension, exact duplicate rows are
    dropped, numeric columns are cast to float and rescaled with
    ``RobustScaler``, and every other column is cast to whitespace-stripped
    strings. The result is written to ``<input name>_cleaned.csv`` in the
    current working directory.

    Missing values are reported before and after cleaning. Nothing is imputed:
    missing entries stay missing (previously ``astype(str)`` turned them into
    the literal text ``'nan'``, which made the second report read zero).

    Args:
        file_path: Path to a ``.csv``, ``.xls``/``.xlsx``, ``.json`` or
            ``.xml`` file.

    Returns:
        The name of the CSV file that was written.

    Raises:
        ValueError: If the extension is not supported or no data was loaded.
    """
    import os  # function-scope import: this cell's import block does not provide os

    # splitext/basename keep dots inside the name ("a.b.csv" -> "a.b") and
    # also cope with the platform's path separator.
    stem, extension = os.path.splitext(os.path.basename(file_path))
    file_extension = extension.lstrip('.').lower()

    if file_extension == 'csv':
        df = pd.read_csv(file_path)
    elif file_extension in ['xls', 'xlsx']:
        df = pd.read_excel(file_path)
    elif file_extension == 'json':
        df = pd.read_json(file_path)
    elif file_extension == 'xml':
        df = xml_to_dataframe(file_path)
    else:
        raise ValueError("Unsupported file format. Supported formats: CSV, Excel (XLS/XLSX), JSON, XML")

    if df is None:
        raise ValueError("Failed to load data from file.")

    print("Missing values before imputation:")
    print(df.isnull().sum())

    df = df.drop_duplicates()

    numeric_cols = df.select_dtypes(include=np.number).columns
    categorical_cols = df.select_dtypes(exclude=np.number).columns

    # Convert numeric columns to float
    if not numeric_cols.empty:
        df[numeric_cols] = df[numeric_cols].astype(float)

    # Convert the remaining columns to stripped strings, then put the missing
    # entries back so they are not replaced by the text 'nan'.
    if not categorical_cols.empty:
        original_text = df[categorical_cols]
        df[categorical_cols] = (
            original_text.astype(str)
            .apply(lambda column: column.str.strip())
            .where(original_text.notna())
        )

    # Rescale numeric columns with RobustScaler. This only rescales the
    # values; no rows are removed.
    if not numeric_cols.empty:
        robust_scaler = RobustScaler()
        df[numeric_cols] = robust_scaler.fit_transform(df[numeric_cols])

    print("Missing values after imputation:")
    print(df.isnull().sum())

    # Save cleaned data to CSV
    output_file = stem + '_cleaned.csv'
    df.to_csv(output_file, index=False)

    return output_file

def xml_to_dataframe(xml_file):
    """Turn a two-level XML document into a DataFrame.

    Each direct child of the root element becomes one row; each of that
    child's own children contributes a column named after its tag, holding
    the element's text.

    Args:
        xml_file: Path (or file object) accepted by ``ET.parse``.

    Returns:
        A ``pandas.DataFrame`` with one row per child of the root.
    """
    records = [
        {field.tag: field.text for field in record}
        for record in ET.parse(xml_file).getroot()
    ]
    return pd.DataFrame(records)

# Example usage: clean one CSV file and report where the cleaned copy was written.
# NOTE(review): the absolute /content path presumably refers to a Colab upload — adjust for other environments.
input_file_path = '/content/symbols_valid_meta.csv'  # Change file extension as needed
output_file = clean_data(input_file_path)  # clean_data returns the name of the CSV it wrote
print(f"Cleaned data saved to: {output_file}")


Missing values before imputation:
Nasdaq Traded          0
Symbol                 0
Security Name          0
Listing Exchange       0
Market Category        0
ETF                    0
Round Lot Size         0
Test Issue             0
Financial Status    4666
CQS Symbol          3383
NASDAQ Symbol          0
NextShares             0
dtype: int64
Missing values after imputation:
Nasdaq Traded       0
Symbol              0
Security Name       0
Listing Exchange    0
Market Category     0
ETF                 0
Round Lot Size      0
Test Issue          0
Financial Status    0
CQS Symbol          0
NASDAQ Symbol       0
NextShares          0
dtype: int64
Cleaned data saved to: symbols_valid_meta_cleaned.csv


Data cleaning without XML support

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler

def clean_data(file_path):
    """Load a tabular file, tidy it and save the result as a CSV file.

    This variant reads CSV, Excel and JSON input only. Exact duplicate rows
    are dropped, numeric columns are cast to float and rescaled with
    ``RobustScaler``, and every other column is cast to whitespace-stripped
    strings. The result is written to ``<input name>_cleaned.csv`` in the
    current working directory.

    Missing values are reported before and after cleaning. Nothing is imputed:
    missing entries stay missing (previously ``astype(str)`` turned them into
    the literal text ``'nan'``, which made the second report read zero).

    Args:
        file_path: Path to a ``.csv``, ``.xls``/``.xlsx`` or ``.json`` file.

    Returns:
        The name of the CSV file that was written.

    Raises:
        ValueError: If the extension is not supported or no data was loaded.
    """
    import os  # function-scope import: this cell's import block does not provide os

    # splitext/basename keep dots inside the name ("a.b.csv" -> "a.b") and
    # also cope with the platform's path separator.
    stem, extension = os.path.splitext(os.path.basename(file_path))
    file_extension = extension.lstrip('.').lower()

    if file_extension == 'csv':
        df = pd.read_csv(file_path)
    elif file_extension in ['xls', 'xlsx']:
        df = pd.read_excel(file_path)
    elif file_extension == 'json':
        df = pd.read_json(file_path)
    else:
        raise ValueError("Unsupported file format. Supported formats: CSV, Excel (XLS/XLSX), JSON")

    if df is None:
        raise ValueError("Failed to load data from file.")

    print("Missing values before imputation:")
    print(df.isnull().sum())

    df = df.drop_duplicates()

    numeric_cols = df.select_dtypes(include=np.number).columns
    categorical_cols = df.select_dtypes(exclude=np.number).columns

    # Convert numeric columns to float
    if not numeric_cols.empty:
        df[numeric_cols] = df[numeric_cols].astype(float)

    # Convert the remaining columns to stripped strings, then put the missing
    # entries back so they are not replaced by the text 'nan'.
    if not categorical_cols.empty:
        original_text = df[categorical_cols]
        df[categorical_cols] = (
            original_text.astype(str)
            .apply(lambda column: column.str.strip())
            .where(original_text.notna())
        )

    # Rescale numeric columns with RobustScaler. This only rescales the
    # values; no rows are removed.
    if not numeric_cols.empty:
        robust_scaler = RobustScaler()
        df[numeric_cols] = robust_scaler.fit_transform(df[numeric_cols])

    print("Missing values after imputation:")
    print(df.isnull().sum())

    # Save cleaned data to CSV
    output_file = stem + '_cleaned.csv'
    df.to_csv(output_file, index=False)

    return output_file

# Example usage:
# The clean_data defined in this cell has no XML branch, so an .xml path is
# rejected with ValueError. The error is caught and shown so the cell finishes
# cleanly instead of stopping a full notebook run with a traceback.
input_file_path = '/content/palestinian_books.xml'  # Change file extension as needed
try:
    output_file = clean_data(input_file_path)
    print(f"Cleaned data saved to: {output_file}")
except ValueError as error:
    print(f"Could not clean {input_file_path}: {error}")


2. File conversion


Final file-conversion code


In [None]:
import pandas as pd
import json
import csv
import os
import xml.etree.ElementTree as ET

def convert_to_json(input_file):
    """Convert a CSV, Excel, XML or fixed-width text file to JSON.

    The output is written next to the input, with the extension replaced by
    ``.json``.

    Args:
        input_file: Path to a ``.csv``, ``.xlsx``/``.xls``, ``.xml`` or
            ``.txt`` file.

    Returns:
        The path of the JSON file that was written.

    Raises:
        ValueError: If the extension is none of the supported ones.
    """
    stem, suffix = os.path.splitext(input_file)
    suffix = suffix.lower()

    # Each loader is wrapped in a lambda so that only the one matching the
    # extension is looked up and executed.
    loaders = {
        '.csv': lambda: pd.read_csv(input_file, encoding='utf-8'),
        '.xlsx': lambda: pd.read_excel(input_file),
        '.xls': lambda: pd.read_excel(input_file),
        '.xml': lambda: xml_to_json(input_file),
        '.txt': lambda: fixed_width_to_json(input_file),
    }
    if suffix not in loaders:
        raise ValueError("Unsupported file format")

    target_path = stem + '.json'
    payload = convert_data_to_serializable(loaders[suffix]())

    with open(target_path, 'w') as handle:
        json.dump(payload, handle, indent=4)

    return target_path

def xlsx_to_json(xlsx_file):
    """Read an Excel file with ``pd.read_excel`` and return its rows.

    Args:
        xlsx_file: Path to the workbook.

    Returns:
        A list with one ``{column: value}`` dict per row.
    """
    return pd.read_excel(xlsx_file).to_dict(orient='records')

def xml_to_json(xml_file):
    """Flatten a two-level XML document into a list of records.

    Every direct child of the root becomes one dict that maps the tags of its
    own children to their text.

    Args:
        xml_file: Path (or file object) accepted by ``ET.parse``.

    Returns:
        A list of ``{tag: text}`` dicts, one per child of the root.
    """
    document_root = ET.parse(xml_file).getroot()
    return [
        {field.tag: field.text for field in record}
        for record in document_root
    ]

def fixed_width_to_json(txt_file, field_spec=None):
    """Slice every line of a fixed-width text file into named fields.

    Args:
        txt_file: Path to the text file.
        field_spec: Optional mapping of field name to ``(start, end)`` slice
            positions. Defaults to ``Field1`` = columns 0-10 and ``Field2`` =
            columns 10-20, the layout this function used before the parameter
            existed.

    Returns:
        A list with one ``{field name: text}`` dict per line. The line
        terminator is removed before slicing so it cannot end up inside a
        field value; padding inside a field is kept as-is.
    """
    if field_spec is None:
        field_spec = {"Field1": (0, 10), "Field2": (10, 20)}

    data = []
    with open(txt_file, 'r') as file:
        for raw_line in file:
            # Short lines would otherwise carry '\n' into their last field.
            line = raw_line.rstrip('\r\n')
            data.append({
                name: line[start:end]
                for name, (start, end) in field_spec.items()
            })

    return data

def convert_data_to_serializable(data):
    """Return ``data`` in a form that ``json.dump`` writes as valid JSON.

    A DataFrame is turned into a list of row dicts. In that list, and in any
    list of dicts passed in directly, missing scalars (``NaN``, ``NaT``,
    ``pd.NA``, ``None``) are replaced by ``None`` so they are written as
    ``null`` instead of the non-standard ``NaN`` token. Any other input is
    returned unchanged.

    Args:
        data: A ``pandas.DataFrame``, a list of dicts, or any other object.

    Returns:
        A list of dicts for DataFrame / list-of-dict input, otherwise ``data``.
    """
    def _null_if_missing(value):
        # Only scalars are tested: pd.isna on a list would return an array.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return None
        return value

    if isinstance(data, pd.DataFrame):
        records = data.to_dict(orient='records')
    elif isinstance(data, list) and all(isinstance(row, dict) for row in data):
        records = data
    else:
        return data

    return [
        {key: _null_if_missing(value) for key, value in row.items()}
        for row in records
    ]

def json_to_xml(input_json):
    """Convert a JSON file holding one record or a list of records to XML.

    The output is written next to the input with the extension replaced by
    ``.xml``. The document has a ``Data`` root with one ``Item`` element per
    record and one child element per key.

    Keys are turned into valid XML element names: characters other than
    letters, digits, ``_``, ``.`` and ``-`` become ``_``, and a leading ``_``
    is added when the name would not start with a letter or underscore (for
    example ``"Security Name"`` -> ``Security_Name``, ``"1st"`` -> ``_1st``).
    ``None`` values produce an empty element instead of the text ``None``.

    Args:
        input_json: Path to the JSON file.

    Returns:
        The path of the XML file that was written.

    Raises:
        ValueError: If the JSON is neither a dict nor a list of dicts.
    """
    import re  # function-scope import: this cell's import block does not provide re

    def _element_name(key):
        name = re.sub(r'[^\w.\-]', '_', str(key))
        # XML names must start with a letter or underscore.
        if not re.match(r'[^\W\d]', name):
            name = '_' + name
        return name

    file_name, _ = os.path.splitext(input_json)
    output_xml_file = file_name + '.xml'

    with open(input_json, 'r') as json_file:
        data = json.load(json_file)

    if isinstance(data, dict):
        records = [data]  # a single record
    elif isinstance(data, list):
        records = data
    else:
        raise ValueError("Unsupported JSON format")

    root = ET.Element('Data')
    for entry in records:
        if not isinstance(entry, dict):
            raise ValueError("Unsupported JSON format")
        item = ET.SubElement(root, 'Item')
        for key, value in entry.items():
            sub_element = ET.SubElement(item, _element_name(key))
            sub_element.text = None if value is None else str(value)

    tree = ET.ElementTree(root)
    tree.write(output_xml_file)

    return output_xml_file

def json_to_csv(input_json):
    """Convert a JSON file holding one record or a list of records to CSV.

    The output is written next to the input with the extension replaced by
    ``.csv``. The header is the union of all record keys in first-seen order,
    so records that lack a key (or add one) no longer abort the export; absent
    values are written as empty cells. A single top-level dict is treated as
    one record and an empty list yields a file with an empty header line.

    Args:
        input_json: Path to the JSON file.

    Returns:
        The path of the CSV file that was written.

    Raises:
        ValueError: If the JSON is neither a dict nor a list of dicts.
    """
    file_name, _ = os.path.splitext(input_json)
    output_csv_file = file_name + '.csv'

    with open(input_json, 'r') as json_file:
        data = json.load(json_file)

    if isinstance(data, dict):
        data = [data]
    if not isinstance(data, list) or not all(isinstance(row, dict) for row in data):
        raise ValueError("Unsupported JSON format")

    # dict.fromkeys de-duplicates while preserving first-seen order.
    fieldnames = list(dict.fromkeys(key for row in data for key in row))

    with open(output_csv_file, 'w', newline='') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)

    return output_csv_file

def json_to_xlsx(input_json):
    """Convert a JSON file to an Excel workbook.

    The parsed JSON is handed to ``pd.DataFrame`` and written, without the
    index column, next to the input with the extension replaced by ``.xlsx``.

    Args:
        input_json: Path to the JSON file.

    Returns:
        The path of the workbook that was written.
    """
    output_xlsx_file = os.path.splitext(input_json)[0] + '.xlsx'

    with open(input_json, 'r') as json_file:
        records = json.load(json_file)

    pd.DataFrame(records).to_excel(output_xlsx_file, index=False)
    return output_xlsx_file

def json_to_text(input_json):
    """Write the records of a JSON file as one JSON document per line.

    The output is written next to the input with the extension replaced by
    ``.txt``. A top-level dict is treated as a single record, so it is written
    as one line rather than as a list of its key names.

    Args:
        input_json: Path to the JSON file.

    Returns:
        The path of the text file that was written.

    Raises:
        ValueError: If the JSON is neither a dict nor a list.
    """
    file_name, _ = os.path.splitext(input_json)
    output_text_file = file_name + '.txt'

    with open(input_json, 'r') as json_file:
        data = json.load(json_file)

    if isinstance(data, dict):
        records = [data]
    elif isinstance(data, list):
        records = data
    else:
        raise ValueError("Unsupported JSON format")

    with open(output_text_file, 'w') as text_file:
        for entry in records:
            text_file.write(json.dumps(entry) + '\n')

    return output_text_file

def main():
    """Ask for an input file and a target format, then run the conversion.

    ``json`` converts a CSV/Excel/XML/text file to JSON via
    ``convert_to_json``. ``xml``, ``csv``, ``xlsx`` and ``text`` hand the
    input to the matching ``json_to_*`` converter, each of which opens it with
    ``json.load`` and therefore expects a JSON file.

    Both answers are stripped of surrounding whitespace and the format is
    lower-cased, so input such as ``" JSON "`` is accepted. The prompt also
    lists ``text``, which was supported but not advertised.
    """
    input_file = input("Enter the path to the input file: ").strip()
    output_format = input("Enter the output format (xml/csv/xlsx/json/text): ").strip().lower()

    # format keyword -> (label used in the message, converter)
    converters = {
        'json': ('JSON', convert_to_json),
        'xml': ('XML', json_to_xml),
        'csv': ('CSV', json_to_csv),
        'xlsx': ('XLSX', json_to_xlsx),
        'text': ('Text', json_to_text),
    }

    if output_format not in converters:
        print("Unsupported output format.")
        return

    label, converter = converters[output_format]
    output_file = converter(input_file)
    print(f"{input_file} has been converted to {label} and saved as {output_file}")

# Start the interactive converter when this code runs as the main module
# (which is the case when the cell is executed in a notebook kernel).
if __name__ == "__main__":
    main()


Enter the path to the input file: /content/palestinian_books.json
Enter the output format (xml/csv/xlsx/json): xml
/content/palestinian_books.json has been converted to XML and saved as /content/palestinian_books.xml


EXHIBIT A


In [None]:
import json
import csv
import os
import pandas as pd
import xml.etree.ElementTree as ET

def convert_to_json(input_file, output_json_file):
    """Convert a CSV, Excel, XML or fixed-width text file to JSON.

    Args:
        input_file: Path to a ``.csv``, ``.xlsx``/``.xls``, ``.xml`` or
            ``.txt`` file.
        output_json_file: Path the JSON output is written to.

    Returns:
        ``output_json_file``, the path of the file that was written.

    Raises:
        ValueError: If the extension is none of the supported ones.
    """
    suffix = os.path.splitext(input_file)[1].lower()

    # Loaders are wrapped in lambdas so that only the one matching the
    # extension is looked up and executed.
    loaders = {
        '.csv': lambda: pd.read_csv(input_file, encoding='utf-8'),
        '.xlsx': lambda: xlsx_to_json(input_file),
        '.xls': lambda: xlsx_to_json(input_file),
        '.xml': lambda: xml_to_json(input_file),
        '.txt': lambda: fixed_width_to_json(input_file),
    }
    loader = loaders.get(suffix)
    if loader is None:
        raise ValueError("Unsupported file format")

    # DataFrames are turned into plain records before being dumped.
    payload = convert_data_to_serializable(loader())

    with open(output_json_file, 'w') as handle:
        json.dump(payload, handle, indent=4)

    return output_json_file

def xlsx_to_json(xlsx_file):
    """Load an Excel file through ``pd.read_excel`` and return its rows.

    Args:
        xlsx_file: Path to the workbook.

    Returns:
        A list with one ``{column: value}`` dict per row.
    """
    sheet = pd.read_excel(xlsx_file)
    return sheet.to_dict(orient='records')

def xml_to_json(xml_file):
    """Flatten a two-level XML document into a list of records.

    Every direct child of the root becomes one dict that maps the tags of its
    own children to their text.

    Args:
        xml_file: Path (or file object) accepted by ``ET.parse``.

    Returns:
        A list of ``{tag: text}`` dicts, one per child of the root.
    """
    return [
        {leaf.tag: leaf.text for leaf in row}
        for row in ET.parse(xml_file).getroot()
    ]

def fixed_width_to_json(txt_file, field_spec=None):
    """Slice every line of a fixed-width text file into named fields.

    Args:
        txt_file: Path to the text file.
        field_spec: Optional mapping of field name to ``(start, end)`` slice
            positions. Defaults to ``Field1`` = columns 0-10 and ``Field2`` =
            columns 10-20, the layout this function used before the parameter
            existed.

    Returns:
        A list with one ``{field name: text}`` dict per line. The line
        terminator is removed before slicing so it cannot end up inside a
        field value; padding inside a field is kept as-is.
    """
    if field_spec is None:
        field_spec = {"Field1": (0, 10), "Field2": (10, 20)}

    data = []
    with open(txt_file, 'r') as file:
        for raw_line in file:
            # Short lines would otherwise carry '\n' into their last field.
            line = raw_line.rstrip('\r\n')
            data.append({
                name: line[start:end]
                for name, (start, end) in field_spec.items()
            })

    return data

def convert_data_to_serializable(data):
    """Return ``data`` in a form that ``json.dump`` writes as valid JSON.

    A DataFrame is turned into a list of row dicts. In that list, and in any
    list of dicts passed in directly (such as the rows ``xlsx_to_json``
    returns), missing scalars (``NaN``, ``NaT``, ``pd.NA``, ``None``) are
    replaced by ``None`` so they are written as ``null`` instead of the
    non-standard ``NaN`` token. Any other input is returned unchanged.

    Args:
        data: A ``pandas.DataFrame``, a list of dicts, or any other object.

    Returns:
        A list of dicts for DataFrame / list-of-dict input, otherwise ``data``.
    """
    def _null_if_missing(value):
        # Only scalars are tested: pd.isna on a list would return an array.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return None
        return value

    if isinstance(data, pd.DataFrame):
        records = data.to_dict(orient='records')
    elif isinstance(data, list) and all(isinstance(row, dict) for row in data):
        records = data
    else:
        return data

    return [
        {key: _null_if_missing(value) for key, value in row.items()}
        for row in records
    ]

def datatocompany(input_file):
    """Convert ``input_file`` to a JSON file stored next to it.

    The output path is the input path with its extension replaced by
    ``.json``. Problems are reported on stdout rather than raised.

    Args:
        input_file: Path to the file to convert.

    Returns:
        The value returned by ``convert_to_json`` on success, or ``None`` when
        the input file does not exist or the conversion raised.
    """
    if not os.path.isfile(input_file):
        print(f"Input file '{input_file}' does not exist.")
        return

    output_json_file = os.path.splitext(input_file)[0] + '.json'

    # Stays None unless the conversion below completes.
    converted_path = None
    try:
        converted_path = convert_to_json(input_file, output_json_file)
        print(f"{input_file} has been converted to JSON and saved as {output_json_file}")
    except Exception as error:
        print(f"Conversion failed: {str(error)}")

    return converted_path

def json_to_xml(input_json, output_xml):
    """Convert a JSON file holding one record or a list of records to XML.

    The document has a ``Data`` root with one ``Item`` element per record and
    one child element per key.

    Keys are turned into valid XML element names: characters other than
    letters, digits, ``_``, ``.`` and ``-`` become ``_``, and a leading ``_``
    is added when the name would not start with a letter or underscore (for
    example ``"Security Name"`` -> ``Security_Name``, ``"1st"`` -> ``_1st``).
    ``None`` values produce an empty element instead of the text ``None``.

    Args:
        input_json: Path to the JSON file.
        output_xml: Path the XML document is written to.

    Raises:
        ValueError: If the JSON is neither a dict nor a list of dicts.
    """
    import re  # function-scope import: this cell's import block does not provide re

    def _element_name(key):
        name = re.sub(r'[^\w.\-]', '_', str(key))
        # XML names must start with a letter or underscore.
        if not re.match(r'[^\W\d]', name):
            name = '_' + name
        return name

    with open(input_json, 'r') as json_file:
        data = json.load(json_file)

    if isinstance(data, dict):
        records = [data]  # a dictionary is a single record
    elif isinstance(data, list):
        records = data  # a list of dictionaries (records)
    else:
        raise ValueError("Unsupported JSON format")

    root = ET.Element('Data')
    for entry in records:
        if not isinstance(entry, dict):
            raise ValueError("Unsupported JSON format")
        item = ET.SubElement(root, 'Item')
        for key, value in entry.items():
            sub_element = ET.SubElement(item, _element_name(key))
            sub_element.text = None if value is None else str(value)

    tree = ET.ElementTree(root)
    tree.write(output_xml)

def json_to_requiredformat(input_json, output_format):
    """Convert a JSON file to ``outputdata.<ext>`` in the requested format.

    Args:
        input_json: Path to the JSON file to convert.
        output_format: One of ``'xml'``, ``'csv'``, ``'xlsx'`` or ``'text'``.

    Returns:
        None. For an unsupported format a message is printed and nothing is
        written.
    """
    # NOTE(review): json_to_csv, json_to_xlsx and json_to_text are called with
    # two arguments here, but no two-argument definitions appear in this cell —
    # confirm they are provided before relying on those branches.
    if output_format == 'xml':
        output_file = 'outputdata.xml'
        json_to_xml(input_json, output_file)
    elif output_format == 'csv':
        output_file = 'outputdata.csv'
        json_to_csv(input_json, output_file)
    elif output_format == 'xlsx':
        output_file = 'outputdata.xlsx'
        json_to_xlsx(input_json, output_file)
    elif output_format == 'text':
        output_file = 'outputdata.txt'
        json_to_text(input_json, output_file)
    else:
        print("Unsupported output format.")
        # Stop here: output_file was never assigned, so the success message
        # below would raise UnboundLocalError.
        return

    print(f"{input_json} has been converted to {output_format} and saved as {output_file}.")

if __name__ == "__main__":
    # Surrounding whitespace is removed and the format is lower-cased so that
    # answers such as " JSON " are recognised. The prompt lists "text", which
    # json_to_requiredformat supports.
    input_file = input("Enter the path to the input file: ").strip()
    output_format = input("Enter the output format (xml/csv/xlsx/json/text): ").strip().lower()
    # Not read below; kept because it is a module-level name other cells may
    # rely on. json_to_requiredformat picks its own output file name.
    output_file = "outputdata." + output_format

    if output_format == 'json':
        # Any supported non-JSON input -> JSON stored next to the input file.
        cvted_file = datatocompany(input_file)
        if cvted_file is not None:
            print(f"Converted JSON file saved as: {cvted_file}")
    else:
        # JSON input -> outputdata.<ext> in the requested format.
        json_to_requiredformat(input_file, output_format)


Enter the path to the input file: /content/outputdata.xml
Enter the output format (xml/csv/xlsx/json): json
/content/outputdata.xml has been converted to JSON and saved as /content/outputdata.json
Converted JSON file saved as: /content/outputdata.json


3. Image to file


In [None]:
!apt-get install -y tesseract-ocr
!pip install pytesseract

In [None]:
import cv2
import pytesseract
import json
import xml.etree.ElementTree as ET

def extract_text_from_image(image_path, output_format='txt'):
    """Run OCR on an image and save the recognised text in the chosen format.

    The output file is written next to the image as
    ``<image path without extension>_extracted_text.<output_format>``. Only
    the final extension is removed, so dots in directory names or in the file
    stem are preserved.

    Args:
        image_path: Path to the image file.
        output_format: ``'txt'``, ``'csv'``, ``'xlsx'``, ``'json'`` or
            ``'xml'``. Any other value prints a message and returns without
            running OCR.

    Returns:
        None. The path of the written file is printed.

    Raises:
        FileNotFoundError: If ``cv2.imread`` could not load the image.
    """
    import os  # function-scope import: this cell's import block does not provide os

    # Validate first so an unsupported format does not cost an OCR pass.
    if output_format not in ('txt', 'csv', 'xlsx', 'json', 'xml'):
        print("Unsupported output format.")
        return

    img = cv2.imread(image_path)
    if img is None:
        # imread signals an unreadable or missing file by returning None.
        raise FileNotFoundError(f"Could not read image: {image_path}")

    text = pytesseract.image_to_string(img)
    output_file_name = os.path.splitext(image_path)[0] + '_extracted_text.' + output_format

    if output_format == 'txt':
        # UTF-8 so OCR output with non-ASCII characters can always be written.
        with open(output_file_name, 'w', encoding='utf-8') as f:
            f.write(text)
    elif output_format == 'csv':
        # A single quoted cell; embedded double quotes are doubled.
        with open(output_file_name, 'w', encoding='utf-8') as f:
            f.write('"{}"'.format(text.replace('"', '""')))
    elif output_format == 'xlsx':
        import pandas as pd
        df = pd.DataFrame({'Text': [text]})
        df.to_excel(output_file_name, index=False)
    elif output_format == 'json':
        with open(output_file_name, 'w') as f:
            json.dump({'text': text}, f, indent=4)
    else:  # 'xml'
        root = ET.Element('Text')
        text_elem = ET.SubElement(root, 'Content')
        text_elem.text = text
        tree = ET.ElementTree(root)
        tree.write(output_file_name)

    print(f"Extracted text saved to: {output_file_name}")

# Example usage: OCR one image and save the text in a format chosen at the prompt.
# NOTE(review): the absolute /content path presumably refers to a Colab upload — adjust for other environments.
input_image_path = '/content/imgtext sample.png'
# Lower-cased so that answers such as "JSON" match the format names the function checks.
output_format = input("Enter the output format (txt/csv/xlsx/json/xml): ").lower()
extract_text_from_image(input_image_path, output_format)


Enter the output format (txt/csv/xlsx/json/xml): json
Extracted text saved to: /content/imgtext sample_extracted_text.json
