# 1. Reading and Analysing


In [1]:
import pyodbc
import pandas as pd

server = 'DELL\\SQLEXPRESS'
database = 'NEHADB'

connection_string = f"DRIVER={{ODBC Driver 17 for SQL Server}};SERVER={server};DATABASE={database};Trusted_Connection=yes;"
conn = pyodbc.connect(connection_string)

query = "SELECT * FROM dbo.APXENRICHED_FUND"
cursor = conn.cursor()
cursor.execute(query)

rows = cursor.fetchall()
columns = [column[0] for column in cursor.description]

df = pd.DataFrame.from_records(rows, columns=columns)

print(df.head())

# Analyze the data
print("Data Overview:\n", df.head())
print("\nStatistics:\n", df.describe(include='all'))


  ACCT_CD ACCT_NAME ACCT_SHT_NAME ACCT_TYP_CD PARENT_CHILD_FLAG CRRNCY_CD  \
0    006x      None          None        None              None      None   
1    007x      None          None        None              None      None   
2    008x      None          None        None              None      None   
3    015x      None          None        None              None      None   
4    020x      None          None        None              None      None   

  STATE_CD CNTRY_CD MKT_VAL NET_ASSETS  ... FACTOR_MODEL_HIERARCHY_CD  \
0     None     None    None       None  ...                      None   
1     None     None    None       None  ...                      None   
2     None     None    None       None  ...                      None   
3     None     None    None       None  ...                      None   
4     None     None    None       None  ...                      None   

  BLOCK_MONITOR_TYPE_CD TARGET_CASH_PCT LAST_MDL_CHANGE_PROCESS_DATE  \
0                  None   

# 2. Data type Suggestion


In [2]:
dtype_mapping = {
    'int64': 'INT',
    'float64': 'FLOAT',
    'object': 'VARCHAR(MAX)', 
    'datetime64[ns]': 'DATETIME',
    'bool': 'BIT'
}

suggested_dtypes = {col: dtype_mapping[str(df[col].dtype)] for col in df.columns}

print("Suggested Data Types for SQL Server:")
print(suggested_dtypes)


Suggested Data Types for SQL Server:
{'ACCT_CD': 'VARCHAR(MAX)', 'ACCT_NAME': 'VARCHAR(MAX)', 'ACCT_SHT_NAME': 'VARCHAR(MAX)', 'ACCT_TYP_CD': 'VARCHAR(MAX)', 'PARENT_CHILD_FLAG': 'VARCHAR(MAX)', 'CRRNCY_CD': 'VARCHAR(MAX)', 'STATE_CD': 'VARCHAR(MAX)', 'CNTRY_CD': 'VARCHAR(MAX)', 'MKT_VAL': 'VARCHAR(MAX)', 'NET_ASSETS': 'VARCHAR(MAX)', 'TOT_ASSETS': 'VARCHAR(MAX)', 'FUND_SHRS_OUTST': 'VARCHAR(MAX)', 'TOT_COST': 'VARCHAR(MAX)', 'NET_CASH': 'VARCHAR(MAX)', 'TOT_INVSTMNTS': 'VARCHAR(MAX)', 'NET_FUNDS_AVAIL': 'VARCHAR(MAX)', 'LIABILITIES': 'VARCHAR(MAX)', 'AMRTZD_COST': 'VARCHAR(MAX)', 'AVG_COST': 'VARCHAR(MAX)', 'OTH_ASSET': 'VARCHAR(MAX)', 'DIV_RECEIVED': 'VARCHAR(MAX)', 'INT_RECEIVED': 'VARCHAR(MAX)', 'CNTRBS': 'VARCHAR(MAX)', 'PMNTS': 'VARCHAR(MAX)', 'TRANSFERS': 'VARCHAR(MAX)', 'CASH_BAL_SOD': 'VARCHAR(MAX)', 'RECVB_SEC_SOLD': 'VARCHAR(MAX)', 'PAYBL_SEC_PURCH': 'VARCHAR(MAX)', 'RECVB_FUND_SHRS_SOLD': 'VARCHAR(MAX)', 'PAYBL_FUND_SHRS_LIQD': 'VARCHAR(MAX)', 'DIV_RCVBLE': 'VARCHAR(MAX)', 

# 3. Dropping NULL Columns 


In [3]:
import pandas as pd

columns_to_keep = []

for column in df.columns:
    non_null_values = df[column].dropna().tolist()
    if non_null_values:  
        columns_to_keep.append(column)
        print(f"{column}: {non_null_values}")

df_cleaned = df[columns_to_keep]


ACCT_CD: ['006x', '007x', '008x', '015x', '020x', '028x', '029x', '031x', '037x', '039x', '040x', '041x', '042x', '045x', '046x', '64', '074x', '076x', '077x', '085x', '10001', '10002', '10003', '10004', '10005', '10006', '10007', '10008', '10009', '1000x', '10010', '10011', '10012', '10013', '1001x', '1002', '10021x', '10022', '10023', '10024x', '10025', '10026', '10027', '10028', '10029', '10030x', '10031x', '10032', '10033', '10034', '10035', '10036', '10037', '10038', '10039', '1003x', '10041', '10042', '10043', '10044', '10045', '10046', '10047', '10048', '10049', '1004x', '1005', '10050', '100501x', '10051', '10052', '10053', '10054', '10055', '10057', '10058', '10059', '10060', '10061', '10062', '10063', '10064', '10065', '10066', '10067', '10068', '10069', '1006x', '1007', '10070x', '10071', '10072', '10073', '10074', '10075', '10076', '10077', '10078', '10079', '10080x', '10081x', '10082', '10083', '10084', '10085', '10086', '10087', '10088', '10089', '1008x', '10090', '10091'

In [4]:
df_cleaned

Unnamed: 0,ACCT_CD,MANAGER,INACTIVE,AUTH_GROUP,AUTH_NUM,UDF_CHAR13
0,006x,TM_DEV,True,,,ca
1,007x,IM_AA_PM,True,IM_TRADE_AUTH,1.0,ca
2,008x,TM_DEV,True,,,ca
3,015x,TM_DEV,True,,,ca
4,020x,TM_DEV,True,,,ca
...,...,...,...,...,...,...
7640,x0145x,TM_DEV,True,,,ca
7641,x014x,TM_DEV,True,,,ca
7642,x10,IM_AA_PM,False,IM_TRADE_AUTH,1.0,ca
7643,x168x,TM_DEV,True,,,ca


# 4. Finding Duplicates and dropping them

In [9]:
import pandas as pd

# Assuming 'df' is your DataFrame
if 'ACCT_CD' in df.columns:
    print("\nAnalyzing 'ACCT_CD' column for duplicates...")

    # Find duplicates
    duplicate_mask = df['ACCT_CD'].duplicated(keep=False)
    duplicate_count = df['ACCT_CD'].duplicated().sum()
    print(f"Number of duplicate entries in 'ACCT_CD': {duplicate_count}")

    if duplicate_count > 0:
        # Get duplicate values only from the 'ACCT_CD' column
        duplicate_values = df.loc[duplicate_mask, 'ACCT_CD'].drop_duplicates().reset_index(drop=True)
        print("\nDuplicate values in 'ACCT_CD':")
        print(duplicate_values)

        # Save only the duplicate 'ACCT_CD' values to an Excel file
        output_file = r"C:\Users\Neha\Desktop\Output.xlsx"
        with pd.ExcelWriter(output_file, engine='xlsxwriter') as writer:
            # Save only the duplicate 'ACCT_CD' values
            duplicate_values.to_excel(writer, sheet_name='Duplicates', index=False, header=['Duplicate ACCT_CD'])

            # Access the workbook and worksheet objects
            workbook = writer.book
            worksheet_duplicates = writer.sheets['Duplicates']

            # Define a format to highlight duplicates in red
            red_format = workbook.add_format({'font_color': 'red', 'bg_color': '#FFC7CE'})

            # Apply red formatting to the 'ACCT_CD' column in the duplicates sheet
            worksheet_duplicates.conditional_format(
                f'A2:A{len(duplicate_values) + 1}',  # Highlight the duplicate ACCT_CD column
                {'type': 'formula', 'criteria': 'TRUE', 'format': red_format}
            )

        print(f"\nExcel file saved with duplicate values at {output_file}")
    else:
        print("\nNo duplicates found in 'ACCT_CD'.")
else:
    print("\n'ACCT_CD' column not found in the DataFrame.")



Analyzing 'ACCT_CD' column for duplicates...
Number of duplicate entries in 'ACCT_CD': 17

Duplicate values in 'ACCT_CD':
0           3024
1           3103
2           3157
3           3227
4            755
5            974
6     ns_ussegPA
7         p14605
8         p2060x
9         p2062x
10         p2764
11         p562x
12         p5751
13        p7496x
14        sma001
15        sma009
16        sma011
Name: ACCT_CD, dtype: object

Excel file saved with duplicate values at C:\Users\Neha\Desktop\Output.xlsx


In [3]:
if 'ACCT_CD' in df.columns:
    print("\nAnalyzing 'ACCT_CD' column for duplicates...")
    
    duplicate_mask = df['ACCT_CD'].duplicated(keep=False)
    duplicate_count = df['ACCT_CD'].duplicated().sum()
    print(f"Number of duplicate entries in 'ACCT_CD': {duplicate_count}")
    
    if duplicate_count > 0:
        # Get duplicate values
        duplicates = df[df['ACCT_CD'].duplicated(keep=False)]
        print("\nDuplicate rows in 'ACCT_CD':")
        print(duplicates)
        
        df_cleaned = df.drop_duplicates(subset='ACCT_CD', keep='first').reset_index(drop=True)
        print("\nDuplicates removed. Altered DataFrame:")
        print(df_cleaned.head())
    else:
        print("\nNo duplicates found in 'ACCT_CD'.")
else:
    print("\n'ACCT_CD' column not found in the DataFrame.")



Analyzing 'ACCT_CD' column for duplicates...
Number of duplicate entries in 'ACCT_CD': 17

Duplicate rows in 'ACCT_CD':
         ACCT_CD ACCT_NAME ACCT_SHT_NAME ACCT_TYP_CD PARENT_CHILD_FLAG  \
1263        3024      None          None        None              None   
1264        3024      None          None        None              None   
1296        3103      None          None        None              None   
1297        3103      None          None        None              None   
1313        3157      None          None        None              None   
1314        3157      None          None        None              None   
1350        3227      None          None        None              None   
1351        3227      None          None        None              None   
2462         755      None          None        None              None   
2463         755      None          None        None              None   
2731         974      None          None        None             

# 5. Changing relevant datatypes in SQL

In [7]:
import pandas as pd
def suggest_datatype(series):
    if pd.api.types.is_integer_dtype(series):
        return 'INTEGER'
    elif pd.api.types.is_float_dtype(series):
        return 'FLOAT'
    elif pd.api.types.is_datetime64_any_dtype(series):
        return 'DATETIME'
    elif pd.api.types.is_string_dtype(series):
        return 'VARCHAR(MAX)'
    else:
        return 'VARCHAR(MAX)'
filtered_columns = df_cleaned.columns.tolist()

data_types = {col: suggest_datatype(df_cleaned[col]) for col in filtered_columns}

print(data_types)


{'ACCT_CD': 'VARCHAR(MAX)', 'MANAGER': 'VARCHAR(MAX)', 'INACTIVE': 'VARCHAR(MAX)', 'AUTH_GROUP': 'VARCHAR(MAX)', 'AUTH_NUM': 'VARCHAR(MAX)', 'UDF_CHAR13': 'VARCHAR(MAX)'}


# 6. Creating Index

In [10]:
import pyodbc
import pandas as pd

server = 'DELL\\SQLEXPRESS'
database = 'NEHADB'

connection_string = f"DRIVER={{ODBC Driver 17 for SQL Server}};SERVER={server};DATABASE={database};Trusted_Connection=yes;"
conn = pyodbc.connect(connection_string)

cursor = conn.cursor()

# Query to dynamically get the primary key or unique column(s) for the table dbo.APXENRICHED_FUND
sql_get_index_column = """
SELECT c.name AS column_name
FROM sys.index_columns ic
JOIN sys.columns c ON ic.column_id = c.column_id AND ic.object_id = c.object_id
JOIN sys.indexes i ON ic.object_id = i.object_id AND ic.index_id = i.index_id
WHERE i.is_primary_key = 1 AND i.object_id = OBJECT_ID('dbo.APXENRICHED_FUND');
"""

cursor.execute(sql_get_index_column)
index_column = cursor.fetchone()

if index_column:
    index_column = index_column[0]  # Fetch the column name

    # Dynamically create an index if it does not exist
    sql_create_index = f"""
    IF NOT EXISTS (
        SELECT 1 
        FROM sys.indexes 
        WHERE object_id = OBJECT_ID('dbo.APXENRICHED_FUND') 
        AND name = 'Index_AF{index_column}'
    )
    BEGIN
        CREATE INDEX Index_AF{index_column}
        ON dbo.APXENRICHED_FUND({index_column});
    END
    """

    cursor.execute(sql_create_index)
    conn.commit()

    # Select the first 5 records
    sql_select_head = "SELECT TOP 5 * FROM dbo.APXENRICHED_FUND"
    data = pd.read_sql(sql_select_head, conn)

    # Reset index and rename the default index column
    data = data.reset_index().rename(columns={'index': 'Index'})
    print(data.head())
else:
    print("No primary key or unique index found for the table dbo.APXENRICHED_FUND.")

cursor.close()
conn.close()


   Index ACCT_CD   MANAGER INACTIVE     AUTH_GROUP AUTH_NUM UDF_CHAR13
0      0    006x    TM_DEV     True           None     None         ca
1      1    007x  IM_AA_PM     True  IM_TRADE_AUTH      1.0         ca
2      2    008x    TM_DEV     True           None     None         ca
3      3    015x    TM_DEV     True           None     None         ca
4      4    020x    TM_DEV     True           None     None         ca


  data = pd.read_sql(sql_select_head, conn)
