In [3]:
import json
import os
import sqlite3
from dataclasses import asdict, dataclass
from datetime import datetime, timezone
from pathlib import Path
from textwrap import dedent
from typing import Any, Dict, List, Tuple, Union

import pandas as pd
from crewai import Agent, Crew, Process, Task
from crewai_tools import tool
from langchain.schema import AgentFinish
from langchain.schema.output import LLMResult
from langchain_community.tools.sql_database.tool import (
    InfoSQLDatabaseTool,
    ListSQLDatabaseTool,
    QuerySQLCheckerTool,
    QuerySQLDataBaseTool,
)
import pickle
from langchain_community.utilities.sql_database import SQLDatabase
from langchain_core.callbacks.base import BaseCallbackHandler
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

In [None]:
user_api_key = ''
os.environ["OPENAI_API_KEY"] = user_api_key

In [45]:
db_host = "localhost"
db_name = "sam1"
db_user = "root"  # Replace with your actual username
db_password = "cronlabs"  # Replace with your actual password
db_port = "3306"

# MySQL connection URI
db_url = f'mysql+pymysql://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}'
db = SQLDatabase.from_uri(db_url)

In [46]:
@dataclass
class Event:
    event: str
    timestamp: str
    text: str


def _current_time() -> str:
    return datetime.now(timezone.utc).isoformat()


class LLMCallbackHandler(BaseCallbackHandler):
    def __init__(self, log_path: Path):
        self.log_path = log_path

    def on_llm_start(
        self, serialized: Dict[str, Any], prompts: List[str], **kwargs: Any
    ) -> Any:
        """Run when LLM starts running."""
        assert len(prompts) == 1
        event = Event(event="llm_start", timestamp=_current_time(), text=prompts[0])
        with self.log_path.open("a", encoding="utf-8") as file:
            file.write(json.dumps(asdict(event)) + "\n")

    def on_llm_end(self, response: LLMResult, **kwargs: Any) -> Any:
        """Run when LLM ends running."""
        generation = response.generations[-1][-1].message.content
        event = Event(event="llm_end", timestamp=_current_time(), text=generation)
        with self.log_path.open("a", encoding="utf-8") as file:
            file.write(json.dumps(asdict(event)) + "\n")
     

In [47]:
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.0)

In [48]:
@tool("list_tables")
def list_tables() -> str:
    """List the available tables in the database"""
    return ListSQLDatabaseTool(db=db).invoke("")

In [49]:
list_tables.run()

Using Tool: list_tables


'ce_dlb_new, ce_sales_new, mx_dlb_new, mx_sales_new'

In [50]:
@tool("tables_schema")
def tables_schema(tables: str) -> str:
    """
    Input is a comma-separated list of tables, output is the schema and sample rows
    for those tables. Be sure that the tables actually exist by calling `list_tables` first!
    Example Input: table1, table2, table3, table4
    """
    tool = InfoSQLDatabaseTool(db=db)
    return tool.invoke(tables)

In [51]:
@tool("execute_sql")
def execute_sql(sql_query: str) -> str:
    """Execute a SQL query against the database. Returns the result"""
    return QuerySQLDataBaseTool(db=db).invoke(sql_query)

In [52]:
@tool("check_sql")
def check_sql(sql_query: str) -> str:
    """
    Use this tool to double check if your query is correct before executing it. Always use this
    tool before executing a query with `execute_sql`.
    """
    return QuerySQLCheckerTool(db=db, llm=llm).invoke({"query": sql_query})

In [53]:
# sql_dev = Agent(
#     role="Precision SQL Analyst",
#     goal="Construct and execute highly accurate SQL queries, with a focus on flexible pattern matching for product models",
#     backstory=
#     """
#     You are an expert SQL Analyst known for your precision and adaptability, especially when dealing with product data across various industries. Your specialty lies in constructing queries that accurately capture all relevant data, particularly when dealing with product models that may have variations in naming conventions.
    
#     Core Competencies:
#     1. Expert Use of LIKE Operator: You excel in using the LIKE operator with wildcards to ensure comprehensive and accurate results when querying for specific product models. This skill is crucial for handling variations in model names, such as different storage capacities or color variants.
#     2. Rapid Schema Analysis: Quickly understand database structures using `list_tables` and `tables_schema`, identifying key columns for product models, sales data, and dates.
#     3. Dynamic Query Construction: Build SQL queries on-the-fly, adapting to user requirements and dataset peculiarities. You're adept at incorporating LIKE patterns for flexible model name matching.
#     4. Advanced Data Manipulation: Proficient in complex joins, subqueries, and aggregations, always considering how to best use LIKE for accurate product identification.
#     5. Query Optimization: Fine-tune queries for optimal performance, balancing the flexibility of LIKE statements with query efficiency.
#     6. Data Quality Assurance: Implement checks to handle inconsistencies in product naming conventions, ensuring all relevant data is captured.
    
#     Approach to Queries:
#     1. Requirement Analysis: Carefully interpret user requests, paying special attention to product model specifications and potential variations.
#     2. Schema Exploration: Use `list_tables` and `tables_schema` to identify relevant tables and columns, focusing on those containing product model information.
#     3. Query Design: Construct SQL queries with strategic use of LIKE operators. For example, use 'LIKE '%S23 Ultra%'' instead of exact matching to capture all relevant product variations.
#     4. Validation: Use the `check_sql` tool to verify query syntax and logic, ensuring LIKE statements are correctly formulated.
#     5. Execution and Review: Run the query with `execute_sql` and critically evaluate the results, checking if all expected product variants are included.
#     6. Iterative Refinement: Be prepared to adjust LIKE patterns based on initial results or additional user input, ensuring comprehensive data capture.
    
#     You excel at translating requests into precise SQL queries that don't miss any relevant data due to minor variations in product names or models. Your goal is to provide accurate, complete results that account for real-world complexities in product data.
#     Always remember:
#     1. Use LIKE with appropriate wildcards (%) for flexible model matching.
#     2. Consider common variations in model names (e.g., storage sizes, colors) when constructing LIKE patterns.
#     3. Validate results to ensure all relevant product data is captured.
#     4. Be prepared to refine LIKE patterns if initial results are incomplete.

#     Important: You are specifically programmed to only respond to greetings and questions related to Samsung's sales and market share data. If a query is not related to these topics, politely inform the user that you can only assist with Samsung sales and market share information, and ask them to rephrase their question accordingly. Do not attempt to answer questions outside of this scope.
#     You always display results in Indian currency units (Lakhs and Crores) for monetary values and in Thousands (K) for volume metrics.
    
#     """,

#     tools=[list_tables, tables_schema, execute_sql, check_sql],
#     allow_delegation=True,

    

    
# )

In [54]:
sql_dev = Agent(
    role="Precision SQL Analyst",
    goal="Construct and execute highly accurate SQL queries, with a focus on flexible pattern matching for product models",
    backstory=
    """
    You are an expert SQL Analyst known for your precision and adaptability, especially when dealing with product data across various industries. Your specialty lies in constructing queries that accurately capture all relevant data, particularly when dealing with product models that may have variations in naming conventions.

    Core Competencies:
    1. Expert Use of LIKE Operator: Excel in using LIKE with wildcards for comprehensive and accurate results, crucial for handling variations in model names.
    2. Rapid Schema Analysis: Quickly understand database structures using list_tables and tables_schema, identifying key columns for product models, sales data, and dates.
    3. Dynamic Query Construction: Build SQL queries on-the-fly, adapting to user requirements and dataset peculiarities, incorporating LIKE patterns for flexible model name matching.
    4. Advanced Data Manipulation: Proficient in complex joins, subqueries, and aggregations, always considering how to best use LIKE for accurate product identification.
    5. Query Optimization: Fine-tune queries for optimal performance, balancing LIKE statements' flexibility with query efficiency.
    6. Data Quality Assurance: Implement checks to handle inconsistencies in product naming conventions, ensuring all relevant data is captured.

    Approach to Queries:
    1. Requirement Analysis: Interpret user requests, focusing on product model specifications and potential variations.
    2. Schema Exploration: Use list_tables and tables_schema to identify relevant tables and columns, especially those with product model information.
    3. Query Design: Construct SQL queries with strategic use of LIKE operators (e.g., 'LIKE '%S23 Ultra%'' instead of exact matching).
    4. Validation: Use check_sql to verify query syntax and logic, ensuring correct LIKE statement formulation.
    5. Execution and Review: Run queries with execute_sql, evaluating results to check if all expected product variants are included.
    6. Iterative Refinement: Adjust LIKE patterns based on initial results or additional user input for comprehensive data capture.
    
    Additional Improvements:
    - **Clarify Variations in Model Names**: When using LIKE, always account for variations (e.g., different storage options, colors, minor model distinctions) and ensure they are captured accurately.
    - **Handle Missing Data**: If certain records or columns are not found, return a clear message indicating the issue (e.g., "No data found for this model in the specified period") instead of running a query that returns null or incorrect data.
    
    Important Columns in the Database:

    1. mx_dlb_new (Mobile Devices Market Share)
        - year_market_share: Year for market share data.
        - region: Geographical region (e.g., EAST, WEST).
        - competitor: Brand or competitor name (e.g., Samsung, Apple).
        - week_year: Specific week of the year.
        - month_market_share: Month of the market share data.
        - priceband: Price range of products (e.g., "30K - 40K").
        - salestype: Type of sales (e.g., Sales, Sellout).
        - channel: Sales channel (e.g., GT for General Trade).
        - model: Product model (e.g., WATCH4 44MM_LTE).
        - Product_Group: Type of product (e.g., Wearable, Smart Phone).
        - sellout_value: Total value of products sold.
        - sellout_volume: Units sold.

    2. mx_sales_new (Mobile Devices Sales Data)
        - salestype: Type of sales (e.g., Sell Out).
        - salescategory: Sales category (e.g., Normal, Promotional).
        - channel: Sales channel (e.g., B2B).
        - priceband: Price range of product sold.
        - region: Geographical region.
        - year, month, day: Date-related columns.
        - date: Exact transaction date.
        - weekyear: Week and year of sales.
        - segment: Product segment (e.g., Smart Phone, Wearable).
        - model: Product model.
        - quantity: Number of units sold.
        - amount: Total sales amount.

    3. ce_sales_new (Consumer Electronics Sales Data)
        - year_sale: Year of sale.
        - month_sale: Month of sale.
        - Sales Type: Type of sale (e.g., Sell Out).
        - salestype: (e.g., Union)
        - p0: Broad product category (e.g., DA for Digital Appliances, VD).
        - P1: Product type within the broad category (e.g., AC for Air Conditioners, UNMAP).
        - p2: Product variant or subtype (e.g., AIRPURIFIER, PANEL, DW etc.,).
        - p3: Specific model or further product classification (e.g., AIRPURI for specific air purifiers, DISHWASHER).
        - week_year_sale: Week and year of sale.
        - date_sale: Date of sale.
        - channel: Sales channel (e.g., MT for Modern Trade).
        - region: Geographical region.
        - quantity: Number of units sold.
        - amount: Total sales amount.

    4. ce_dlb_new (Consumer Electronics Market Share Data)
        - year_market_share: Year of market share data.
        - month_market_share: Month of market share data.
        - week_year: Week and year of market share data.
        - market_share_date: Date of market share data collection.
        - region: Geographical region.
        - channel: Sales channel (e.g., DD, MR).
        - competitor: Competitor brand.
        - p0: Broad product category (e.g., DA for Digital Appliances, VD).
        - p1: Product type within the broad category (e.g., AC for Air Conditioners, UNMAP).
        - p2: Product variant or subtype  (e.g., AC, SOUNDBAR, DISHWASHER).
        - p3: Specific model or further product classification.
        - sellout_volume: Units sold.
        - sellout_value: Total sales amount.

    Key Reminders:
    1. Use LIKE with appropriate wildcards (%) for flexible model matching.
    2. Consider common variations in model names (e.g., storage sizes, colors) when constructing LIKE patterns.
    3. When handling price band queries:
       - For exact price band matches, use the specific category (e.g., '10 K-15 K').
       - For price ranges spanning multiple categories:
         - Use OR conditions to include all relevant price bands.
         - Example: For "10K - 20K", include both '10 K-15 K' AND '15 K-20 K'.
    4. Validate results to ensure all relevant product data and price ranges are captured.
    5. Be prepared to refine LIKE patterns and price band logic if initial results are incomplete.

    Scope and Data Presentation:
    - Only respond to greetings and questions related to Samsung's sales and market share data.
    - Politely redirect off-topic queries, asking users to rephrase questions accordingly.
    - Display results in Indian currency units (Lakhs and Crores) for monetary values and in Thousands (K) for volume metrics.

    Database Structure:
    Four main tables: 'ce_dlb_new', 'ce_sales_new', 'mx_dlb_new', 'mx_sales_new' (covering consumer electronics and mobile devices for market share and sales data).

    Query Types:
    1. Sales Data: Use 'ce_sales_new' or 'mx_sales_new' with flexible model matching.
    2. Market Share Data: Use 'ce_dlb_new' or 'mx_dlb_new' for regional analysis.
    3. Date Ranges: Handle year, month, and week filters.
    4. Regional Filters: Filter by geographical regions (e.g., EAST, WEST, NORTH 2).
    5. Model Variations: Use LIKE for variations such as storage sizes and colors.

    Additional Information:
    - Apple 15 launch date: September 22, 2023 (relevant for market trend analysis).

    When constructing queries:
    - Always consider which join type is most appropriate for the specific analysis.
    - Use table aliases for clarity in complex joins (e.g., 'ce' for ce_sales_new).
    - Ensure join conditions accurately reflect the relationships between tables.
    - Use appropriate date functions (e.g., EXTRACT, DATE_TRUNC) for date-based analysis.
    - Implement proper grouping and aggregation for summary statistics.
    - Consider using CTEs for complex queries to improve readability and maintainability.
    """,

    tools=[list_tables, tables_schema, execute_sql, check_sql],
    allow_delegation=True,
    llm=llm,
)

In [55]:
# data_analyst = Agent(
#     role="Senior Data Analyst",
#     goal="You receive data from the database developer and analyze it",
#     backstory=dedent(
#         """
#         You have deep experience with analyzing datasets using Python.
#         Your work is always based on the provided data and is clear,
#         easy-to-understand and to the point. You have attention
#         to detail and always produce very detailed work (as long as you need).
#     """
#     ),
#     llm=llm,
#     allow_delegation=False,
# )

In [56]:

# report_writer = Agent(
#     role="Senior Report Editor",
#     goal="Write an executive summary type of report based on the work of the analyst",
#     backstory=dedent(
#         """
#         Your writing still is well known for clear and effective communication.
#         You always summarize long texts into bullet points that contain the most
#         important details.
#         """
#     ),
#     llm=llm,
#     allow_delegation=False,
# )
 

In [57]:
#Task
extract_data = Task(
    description="Extract data that is required for the query {query}.",
    expected_output="Database result for the query",
    agent=sql_dev,
)

In [58]:
# analyze_data = Task(
#     description="Analyze the data from the database and write an analysis for {query}.",
#     expected_output="Detailed analysis text",
#     agent=data_analyst,
#     context=[extract_data],
# )

In [59]:
# write_report = Task(
#     description=dedent(
#         """
#         Write an executive summary of the report from the analysis. The report
#         must be less than 100 words.
#     """
#     ),
#     expected_output="Markdown report",
#     agent=report_writer,
#     context=[analyze_data],
# )

In [60]:
crew = Crew(
    agents=[sql_dev],
    tasks=[extract_data],
    process=Process.sequential,
    verbose=1,
    memory=False,
    max_rpm=None,
    max_iter=25,
    output_log_file="crew.log",
)



In [61]:
result = crew.kickoff(inputs={"query": "total sales for each galaxy models in 2023"})

[1m[95m# Agent:[00m [1m[92mPrecision SQL Analyst[00m
[95m## Task:[00m [92mExtract data that is required for the query total sales for each galaxy models in 2023.[00m


[1m[95m# Agent:[00m [1m[92mPrecision SQL Analyst[00m
[95m## Thought:[00m [92mI need to extract data for total sales of each Galaxy model in 2023. First, I will check the available tables to identify where the sales data is stored.[00m
[95m## Using tool:[00m [92mlist_tables[00m
[95m## Tool Input:[00m [92m
"{}"[00m
[95m## Tool Output:[00m [92m
ce_dlb_new, ce_sales_new, mx_dlb_new, mx_sales_new[00m


[1m[95m# Agent:[00m [1m[92mPrecision SQL Analyst[00m
[95m## Thought:[00m [92mThought: I have identified the available tables. Since I need to extract sales data for Galaxy models, I will check the schema of the `mx_sales_new` table, as it is likely to contain mobile device sales data.[00m
[95m## Using tool:[00m [92mtables_schema[00m
[95m## Tool Input:[00m [92m
"{\"tables\": \"mx_

In [None]:
# import os
# import io
# import pandas as pd
# from openai import OpenAI

# # Set the OpenAI API key (replace with your actual key, which should be handled securely)
# user_api_key = ''
# os.environ["OPENAI_API_KEY"] = user_api_key

# # Initialize the OpenAI client
# client = OpenAI()

# def process_data(result):
#     prompt = f"""
#     Analyze the following data and convert it to CSV format:
#     {result}
#     Return ONLY the CSV data without any additional text or explanation.
#     Ensure that all numeric values are maintained in their correct format.
#     """
#     response = client.chat.completions.create(
#         model="gpt-3.5-turbo",
#         messages=[
#             {"role": "system", "content": "You are a helpful assistant that converts data to CSV format."},
#             {"role": "user", "content": prompt}
#         ],
#         temperature=0.0
#     )
#     return response.choices[0].message.content.strip()

# def handle_output(output, filename='output.csv'):
#     # Check if the output looks like CSV
#     if ',' in output and '\n' in output:
#         try:
#             # Attempt to convert to DataFrame
#             df = pd.read_csv(io.StringIO(output))
#             pd.set_option('display.max_rows', None)
#             pd.set_option('display.max_columns', None)
#             pd.set_option('display.width', None)
#             pd.set_option('display.max_colwidth', None)
#             print("Output converted to DataFrame:")
#             print(df)
            
#             # Save the DataFrame to a CSV file
#             df.to_csv(filename, index=False)
#             print(f"CSV file saved as {filename}")
            
#             return df
#         except Exception as e:
#             print(f"Error converting to DataFrame: {e}")
#             print("Returning output as is:")
#             return output
#     else:
#         print("Output is not in CSV format. Returning as is:")
#         return output

# # Example usage
# final_result = process_data(result)
# final_output = handle_output(final_result)
# print(final_output)

In [None]:
import os
import io
import pandas as pd
from openai import OpenAI

# Set the OpenAI API key (replace with your actual key, which should be handled securely)
user_api_key = ''
os.environ["OPENAI_API_KEY"] = user_api_key

# Initialize the OpenAI client
client = OpenAI()

def process_data_to_csv(result):
    prompt = f"""
    Analyze the following data and convert it to CSV format:
    {result}
    Return ONLY the CSV data without any additional text or explanation.
    Ensure that all numeric values are maintained in their correct format.
    """
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant that converts data to CSV format."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.0
    )
    return response.choices[0].message.content.strip()

def extract_business_insights(data):
    prompt = f"""
    Based on the following CSV data, extract meaningful business insights. 
    Highlight trends, potential opportunities, and any notable patterns.
    {data}
    Provide your insights in bullet points format.
    """
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are an expert data analyst who provides business insights."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.0
    )
    return response.choices[0].message.content.strip()

def handle_output(output, filename='output.csv'):
    # Check if the output looks like CSV
    if ',' in output and '\n' in output:
        try:
            # Attempt to convert to DataFrame
            df = pd.read_csv(io.StringIO(output))
            pd.set_option('display.max_rows', None)
            pd.set_option('display.max_columns', None)
            pd.set_option('display.width', None)
            pd.set_option('display.max_colwidth', None)
            print("Output converted to DataFrame:")
            print(df)
            
            # Save the DataFrame to a CSV file
            df.to_csv(filename, index=False)
            print(f"CSV file saved as {filename}")
            
            return df
        except Exception as e:
            print(f"Error converting to DataFrame: {e}")
            print("Returning output as is:")
            return output
    else:
        print("Output is not in CSV format. Returning as is:")
        return output

# Example usage
final_result = process_data_to_csv(result)
final_output = handle_output(final_result)

# If the output is successfully converted to CSV, extract business insights
if isinstance(final_output, pd.DataFrame):
    insights = extract_business_insights(final_output.to_csv(index=False))
    print("\nBusiness Insights:")
    print(insights)
else:
    print("Error in generating business insights due to non-CSV output.")


Output converted to DataFrame:
                           Model  Total Quantity  Total Amount
0       GALAXY M01 CORE (1/16GB)            6155  7.688462e+07
1                  GALAXY Z FLIP             275  7.368332e+07
2            GALAXY M01 (3/32GB)            6929  1.316749e+08
3               GALAXY TAB A 7.0             863  2.160949e+07
4            GALAXY M11 (3/32GB)            8548  2.163684e+08
5           GALAXY A31 (6/128GB)            1951  7.515922e+07
6           GALAXY A71 (8/128GB)             864  5.705363e+07
7       GALAXY M01 CORE (2/32GB)            6957  1.055884e+08
8                GALAXY FIT LITE            8411  5.118444e+07
9                    GALAXY FIT2            1175  1.133439e+07
10                   GALAXY STAR              40  5.204100e+05
11                  GALAXY BOOK2           12426  2.269363e+09
12          GALAXY BOOK2 PRO 360            4732  1.469864e+09
13      GALAXY Z FOLD 3 (256 GB)            9517  3.535746e+09
14      GALAXY Z FOLD 3 

In [42]:
df = pd.read_csv('output.csv')
df

Unnamed: 0,Model,Total Quantity,Total Amount
0,GALAXY M01 CORE (1/16GB),6155,76884620.0
1,GALAXY Z FLIP,275,73683320.0
2,GALAXY M01 (3/32GB),6929,131674900.0
3,GALAXY TAB A 7.0,863,21609490.0
4,GALAXY M11 (3/32GB),8548,216368400.0
5,GALAXY A31 (6/128GB),1951,75159220.0
6,GALAXY A71 (8/128GB),864,57053630.0
7,GALAXY M01 CORE (2/32GB),6957,105588400.0
8,GALAXY FIT LITE,8411,51184440.0
9,GALAXY FIT2,1175,11334390.0


In [1]:
import pandas as pd

In [2]:
df1 = pd.read_csv(r'F:\hirthickkesh\sam\mx_sales_new.csv')
df2 = pd.read_csv(r'F:\hirthickkesh\sam\mx_dlb_new.csv')
df3 = pd.read_csv(r'F:\hirthickkesh\sam\ce_sales_new.csv')
df4 = pd.read_csv(r'F:\hirthickkesh\sam\ce_dlb_new.csv')

In [5]:
df1.head(2)

Unnamed: 0,salestype,salescategory,channel,priceband,region,year,month,day,date,weekyear,segment,model,quantity,amount
0,Sell Out,Normal,B2B,< 6 K,B2b,2024,2,7,2024-02-07,24.06W,Smart Phone,A2 CORE,4,20744.0
1,Sell Out,Normal,B2B,< 6 K,B2b,2024,2,16,2024-02-16,24.07W,Smart Phone,A2 CORE,8,165952.0


In [21]:
df1['region'].value_counts()

region
ONLINE     55748
MT         49138
NORTH 1    42884
SOUTH 1    42761
EAST 1     41927
WEST 2     41879
NORTH 2    41601
WEST 1     41285
NORTH 3    40505
WEST 3     39403
NORTH 4    38931
SOUTH 3    38719
EAST 2     36945
SOUTH 2    35638
WEST 4     35147
B2B          997
Name: count, dtype: int64

In [11]:

# Capitalize the 'model' and 'region' columns in both datasets
df1['model'] = df1['model'].str.upper()
df1['region'] = df1['region'].str.upper()

df2['model'] = df2['model'].str.upper()
df2['region'] = df2['region'].str.upper()

In [14]:
df1.shape

(623508, 14)

In [12]:


# Merge the tables based on 'region' and 'model'
merged_df_capitalized = pd.merge(df1, df2, how='left', left_on=['region', 'model'], right_on=['region', 'model'])




In [16]:
df1['area'].unique()

array(['B2B', 'GT', 'MT', nan, 'ONLINE'], dtype=object)

In [17]:
df2['channel'].unique()

array(['GT'], dtype=object)

In [18]:

# Check if there is a column named 'area' in the merged dataframe
if 'region' in merged_df_capitalized.columns:
    # Group by the 'area' column and check for null values in other columns
    null_values_by_area = merged_df_capitalized.groupby('region').apply(lambda x: x.isnull().sum())

    
else:
    null_values_by_area = "The column 'area' does not exist in the dataset."


  null_values_by_area = merged_df_capitalized.groupby('region').apply(lambda x: x.isnull().sum())
