In [3]:
! pip install openpyxl




In [4]:
# LOADING DATA:
import pandas as pd

DATA=pd.read_csv(r"C:\Users\Nagesh Agrawal\OneDrive\Desktop\EDA\DATA\archive (1)\data_type.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\Nagesh Agrawal\\OneDrive\\Desktop\\EDA\\DATA\\archive (1)\\data_type.csv'


## 🔢 **DATA TYPE PRIORITY SEQUENCE**

```plaintext
1️⃣ OBJECT  
2️⃣ NUMBER (INT, FLOAT)  
3️⃣ BOOLEAN  
```

---

## 💡 **DATA TYPE BREAKDOWN**

### 🧰 **1. OBJECT** (Highest Priority)

* **Description:** Mixed values or inconsistent types (strings, complex numbers, etc.)
* **Examples:**

  * `["A", "B", "C"]`
  * `[(1+1j), (3+4j), None]`
  * `[1, 2, "THREE"]`

---

### 🏷️ **2. CATEGORY**

* **Description:** Predefined fixed categories
* **Usage:** `pd.Categorical(["CAT", "DOG", "CAR"])`

---

### 🔘 **3. BOOL**

* **Description:** Boolean values only
* **Examples:** `[True, False, True]`

---

### 🔢 **4. INT64**

* **Description:** Whole numbers without `NaN`
* **Examples:** `[1, 2, 3]`

---

### 🔣 **5. FLOAT64**

* **Description:** Decimal numbers or contains `NaN`
* **Examples:** `[1.2, 2.2, 3.2, None]`

---

### 📅 **6. DATETIME64\[ns]**

* **Description:** Date & Time values
* **Usage:** `pd.to_datetime(["2024-05-07", "2024-05-08"])`

---

### ⏳ **7. TIMEDELTA64\[ns]**

* **Description:** Difference in time (duration)
* **Usage:** `pd.to_timedelta("1 days")`

---

✅ **TIP:**
Python chooses the **most general data type** if values are mixed. Convert types explicitly using `astype()` or pandas conversion functions like `pd.to_datetime()`, `pd.to_numeric()`.



#              IDENTIFYING COLUMN WITH IN VALID DATA TYPE:



In [None]:
DATA.head()

Unnamed: 0,Order_ID,Order_Date,Quantity,Discount,Sales,Postal_Code,Product_ID,Is_Returned,Customer_Age,Region
0,1001,2022/01/10,'5',0.1,500.75,'560001',P-001,Yes,'29',South
1,1002,'2022-01-12',3,'0.05','450.00',110001,P-002,No,34,North
2,1003,13-01-2022,'two',0,350,'700014',P-003,yes,45,East
3,1004,2022-01-15,4,0.2,four50,'500081',P-004,NO,'Thirty',West
4,1005,2022.01.18,3,,480.5,560002,P-005,Yes,28,South


In [None]:
DATA.tail()

Unnamed: 0,Order_ID,Order_Date,Quantity,Discount,Sales,Postal_Code,Product_ID,Is_Returned,Customer_Age,Region
10,1011,2022/01/27,seven,0.2,610.00,700016,P-011,Yes,40,East
11,1012,2022-01-29,3,0.05,490,'500083',P-012,NO,'Thirty-two',West
12,1013,2022/01/30,2,0.1,420.00,560004,P-013,yes,,South
13,1014,2022.01.31,'three',0.1,'410.00',110004,P-014,no,36,North
14,1015,'2022-02-01',4,0.05,five00,700017,P-015,Yes,'33',East


In [None]:
DATA.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Order_ID      15 non-null     int64 
 1   Order_Date    15 non-null     object
 2   Quantity      15 non-null     object
 3   Discount      14 non-null     object
 4   Sales         14 non-null     object
 5   Postal_Code   15 non-null     object
 6   Product_ID    15 non-null     object
 7   Is_Returned   15 non-null     object
 8   Customer_Age  13 non-null     object
 9   Region        15 non-null     object
dtypes: int64(1), object(9)
memory usage: 1.3+ KB


In [None]:
'''
# Find rows where conversion failed (i.e., bad values)
bad_values = df[col][converted.isna() & df[col].notna()]
'''

In [None]:
for col in DATA.columns:
    print(f"\nValue counts for '{col}':\n{DATA[col].value_counts()}")



Value counts for 'Order_ID':
Order_ID
1001    1
1002    1
1003    1
1004    1
1005    1
1006    1
1007    1
1008    1
1009    1
1010    1
1011    1
1012    1
1013    1
1014    1
1015    1
Name: count, dtype: int64

Value counts for 'Order_Date':
Order_Date
2022/01/10      1
'2022-01-12'    1
13-01-2022      1
2022-01-15      1
2022.01.18      1
2022/01/20      1
2022-01-22      1
'2022/01/24'    1
2022-01-25      1
01-26-2022      1
2022/01/27      1
2022-01-29      1
2022/01/30      1
2022.01.31      1
'2022-02-01'    1
Name: count, dtype: int64

Value counts for 'Quantity':
Quantity
3          3
'5'        2
4          2
6          2
'two'      1
Four       1
5          1
seven      1
2          1
'three'    1
Name: count, dtype: int64

Value counts for 'Discount':
Discount
0.1       5
0.05      3
0.2       2
'0.05'    1
0         1
0.15      1
'zero'    1
Name: count, dtype: int64

Value counts for 'Sales':
Sales
500.75      1
'450.00'    1
350         1
four50      1
480.5       1

In [None]:
# WORKING ON Order_Date COLUMN:
# REMOVE ANY SURROUNDING SINGLE AND DOUBLE quotes :
DATA['Order_Date'] = DATA['Order_Date'].astype(str).str.replace("'", "").str.replace('"', '')
DATA["Order_Date"]=DATA["Order_Date"].astype(str).str.replace("/","-").str.replace(".","-")


# CORRECTING FORMAT:

def correct_format(x):
    parts = x.split('-')
    if len(parts) == 3 and len(parts[0]) == 2 and len(parts[1]) == 2 and len(parts[2]) == 4:
        return f"{parts[2]}-{parts[1]}-{parts[0]}"
    return x  # leave already correct formats untouched

DATA['Order_Date'] = DATA['Order_Date'].apply(correct_format)


In [None]:

DATA["Order_Date"].astype(str).str

<pandas.core.strings.accessor.StringMethods at 0x2a6ad82c620>

In [None]:
DATA

Unnamed: 0,Order_ID,Order_Date,Quantity,Discount,Sales,Postal_Code,Product_ID,Is_Returned,Customer_Age,Region
0,1001,2022-01-10,'5',0.1,500.75,'560001',P-001,Yes,'29',South
1,1002,2022-01-12,3,'0.05','450.00',110001,P-002,No,34,North
2,1003,2022-01-13,'two',0,350,'700014',P-003,yes,45,East
3,1004,2022-01-15,4,0.2,four50,'500081',P-004,NO,'Thirty',West
4,1005,2022-01-18,3,,480.5,560002,P-005,Yes,28,South
5,1006,2022-01-20,6,0.15,600,110002,P-006,no,32,North
6,1007,2022-01-22,'5',0.05,'500.00',700015,P-007,YES,,East
7,1008,2022-01-24,Four,0.1,550.25,500082,P-008,nO,37,West
8,1009,2022-01-25,5,'zero',475.00,'560003',P-009,Yes,31,South
9,1010,2022-26-01,6,0.1,,110003,P-010,No,'35',North


In [None]:
# CHANGING DATA TYPE :
DATA['Order_Date'] = pd.to_datetime(DATA['Order_Date'],errors="coerce")
# errors='coerce': converts invalid formats to NaT (missing datetime). for nan values 
# dayfirst=True: handles formats like 13-01-2022.

In [None]:
DATA

Unnamed: 0,Order_ID,Order_Date,Quantity,Discount,Sales,Postal_Code,Product_ID,Is_Returned,Customer_Age,Region
0,1001,2022-01-10,'5',0.1,500.75,'560001',P-001,Yes,'29',South
1,1002,2022-01-12,3,'0.05','450.00',110001,P-002,No,34,North
2,1003,2022-01-13,'two',0,350,'700014',P-003,yes,45,East
3,1004,2022-01-15,4,0.2,four50,'500081',P-004,NO,'Thirty',West
4,1005,2022-01-18,3,,480.5,560002,P-005,Yes,28,South
5,1006,2022-01-20,6,0.15,600,110002,P-006,no,32,North
6,1007,2022-01-22,'5',0.05,'500.00',700015,P-007,YES,,East
7,1008,2022-01-24,Four,0.1,550.25,500082,P-008,nO,37,West
8,1009,2022-01-25,5,'zero',475.00,'560003',P-009,Yes,31,South
9,1010,NaT,6,0.1,,110003,P-010,No,'35',North


In [None]:
# WORKING ON QUANTITIES:-
DATA["Quantity"].value_counts()

Quantity
3          3
'5'        2
4          2
6          2
'two'      1
Four       1
5          1
seven      1
2          1
'three'    1
Name: count, dtype: int64

In [None]:
DATA["Quantity"]=DATA["Quantity"].astype(str).str.replace("'","").str.replace('"','')

In [None]:
DATA["Quantity"].value_counts()

Quantity
5        3
3        3
4        2
6        2
two      1
Four     1
seven    1
2        1
three    1
Name: count, dtype: int64

In [None]:
! pip install word2number



In [None]:
'''import pandas as pd
from word2number import w2n

# Sample data
df = pd.DataFrame({'value': ['10', 'twenty', '30', 'forty five', 'error']})

# Create an empty list to store converted values
converted_values = []

# Loop through each value
for x in df['value']:
    try:
        converted_values.append(round(int(x), 0))
    except:
        try:
            converted_values.append(w2n.word_to_num(str(x).lower()))
        except:
            converted_values.append(x)  # or use np.nan for missing

# Assign the list back to the column (or a new column)
df['converted_value'] = converted_values

print(df)
'''

In [None]:
# Replace non-numeric text with word-to-number conversion
def convert_text_to_number(x):
    from word2number import w2n
    try:
        return round(int(x),0)
    except:
        try:
            return w2n.word_to_num(str(x).lower())
        except:
            return x  # or np.nan if using numpy

In [None]:
DATA["Quantity"]=DATA["Quantity"].apply(convert_text_to_number)

In [None]:
DATA

Unnamed: 0,Order_ID,Order_Date,Quantity,Discount,Sales,Postal_Code,Product_ID,Is_Returned,Customer_Age,Region
0,1001,2022-01-10,5,0.1,500.75,'560001',P-001,Yes,'29',South
1,1002,2022-01-12,3,'0.05','450.00',110001,P-002,No,34,North
2,1003,2022-01-13,2,0,350,'700014',P-003,yes,45,East
3,1004,2022-01-15,4,0.2,four50,'500081',P-004,NO,'Thirty',West
4,1005,2022-01-18,3,,480.5,560002,P-005,Yes,28,South
5,1006,2022-01-20,6,0.15,600,110002,P-006,no,32,North
6,1007,2022-01-22,5,0.05,'500.00',700015,P-007,YES,,East
7,1008,2022-01-24,4,0.1,550.25,500082,P-008,nO,37,West
8,1009,2022-01-25,5,'zero',475.00,'560003',P-009,Yes,31,South
9,1010,NaT,6,0.1,,110003,P-010,No,'35',North


In [None]:
DATA["Quantity"].value_counts()

Quantity
3    4
5    3
4    3
2    2
6    2
7    1
Name: count, dtype: int64

In [None]:
DATA["Quantity"]=pd.to_numeric(DATA["Quantity"])
#DATA["Quantity"]=DATA["Quantity"].astype(int)

In [None]:
DATA.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Order_ID      15 non-null     int64         
 1   Order_Date    14 non-null     datetime64[ns]
 2   Quantity      15 non-null     int64         
 3   Discount      14 non-null     object        
 4   Sales         14 non-null     object        
 5   Postal_Code   15 non-null     object        
 6   Product_ID    15 non-null     object        
 7   Is_Returned   15 non-null     object        
 8   Customer_Age  13 non-null     object        
 9   Region        15 non-null     object        
dtypes: datetime64[ns](1), int64(2), object(7)
memory usage: 1.3+ KB


In [None]:
# WORKING ON DISCOUNT:
DATA["Discount"].value_counts()

Discount
0.1       5
0.05      3
0.2       2
'0.05'    1
0         1
0.15      1
'zero'    1
Name: count, dtype: int64

In [None]:
DATA["Discount"]=DATA["Discount"].astype(str).str.replace("'","")

In [None]:
DATA["Discount"].value_counts()

Discount
0.1     5
0.05    4
0.2     2
0       1
nan     1
0.15    1
zero    1
Name: count, dtype: int64

In [None]:
DATA["Discount"]=DATA["Discount"].apply(convert_text_to_number)

In [None]:
DATA

Unnamed: 0,Order_ID,Order_Date,Quantity,Discount,Sales,Postal_Code,Product_ID,Is_Returned,Customer_Age,Region
0,1001,2022-01-10,5,0.1,500.75,'560001',P-001,Yes,'29',South
1,1002,2022-01-12,3,0.05,'450.00',110001,P-002,No,34,North
2,1003,2022-01-13,2,0.0,350,'700014',P-003,yes,45,East
3,1004,2022-01-15,4,0.2,four50,'500081',P-004,NO,'Thirty',West
4,1005,2022-01-18,3,,480.5,560002,P-005,Yes,28,South
5,1006,2022-01-20,6,0.15,600,110002,P-006,no,32,North
6,1007,2022-01-22,5,0.05,'500.00',700015,P-007,YES,,East
7,1008,2022-01-24,4,0.1,550.25,500082,P-008,nO,37,West
8,1009,2022-01-25,5,0.0,475.00,'560003',P-009,Yes,31,South
9,1010,NaT,6,0.1,,110003,P-010,No,'35',North


In [None]:
DATA["Discount"] = pd.to_numeric(DATA["Discount"],errors="coerce")

In [None]:
DATA.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Order_ID      15 non-null     int64         
 1   Order_Date    14 non-null     datetime64[ns]
 2   Quantity      15 non-null     int64         
 3   Discount      14 non-null     float64       
 4   Sales         14 non-null     object        
 5   Postal_Code   15 non-null     object        
 6   Product_ID    15 non-null     object        
 7   Is_Returned   15 non-null     object        
 8   Customer_Age  13 non-null     object        
 9   Region        15 non-null     object        
dtypes: datetime64[ns](1), float64(1), int64(2), object(6)
memory usage: 1.3+ KB


In [None]:
# WORKING ON SALES:
DATA["Sales"].value_counts()

Sales
500.75      1
'450.00'    1
350         1
four50      1
480.5       1
600         1
'500.00'    1
550.25      1
475.00      1
610.00      1
490         1
420.00      1
'410.00'    1
five00      1
Name: count, dtype: int64

In [None]:
DATA["Sales"]=DATA["Sales"].astype(str).str.replace("'","")

In [None]:
DATA

Unnamed: 0,Order_ID,Order_Date,Quantity,Discount,Sales,Postal_Code,Product_ID,Is_Returned,Customer_Age,Region
0,1001,2022-01-10,5,0.1,500.75,'560001',P-001,Yes,'29',South
1,1002,2022-01-12,3,0.05,450.00,110001,P-002,No,34,North
2,1003,2022-01-13,2,0.0,350,'700014',P-003,yes,45,East
3,1004,2022-01-15,4,0.2,four50,'500081',P-004,NO,'Thirty',West
4,1005,2022-01-18,3,,480.5,560002,P-005,Yes,28,South
5,1006,2022-01-20,6,0.15,600,110002,P-006,no,32,North
6,1007,2022-01-22,5,0.05,500.00,700015,P-007,YES,,East
7,1008,2022-01-24,4,0.1,550.25,500082,P-008,nO,37,West
8,1009,2022-01-25,5,0.0,475.00,'560003',P-009,Yes,31,South
9,1010,NaT,6,0.1,,110003,P-010,No,'35',North


In [None]:
DATA["Sales"]=DATA["Sales"].apply(convert_text_to_number)# its not working here 


In [None]:
DATA

Unnamed: 0,Order_ID,Order_Date,Quantity,Discount,Sales,Postal_Code,Product_ID,Is_Returned,Customer_Age,Region
0,1001,2022-01-10,5,0.1,500.75,'560001',P-001,Yes,'29',South
1,1002,2022-01-12,3,0.05,450.00,110001,P-002,No,34,North
2,1003,2022-01-13,2,0.0,350,'700014',P-003,yes,45,East
3,1004,2022-01-15,4,0.2,four50,'500081',P-004,NO,'Thirty',West
4,1005,2022-01-18,3,,480.5,560002,P-005,Yes,28,South
5,1006,2022-01-20,6,0.15,600,110002,P-006,no,32,North
6,1007,2022-01-22,5,0.05,500.00,700015,P-007,YES,,East
7,1008,2022-01-24,4,0.1,550.25,500082,P-008,nO,37,West
8,1009,2022-01-25,5,0.0,475.00,'560003',P-009,Yes,31,South
9,1010,NaT,6,0.1,,110003,P-010,No,'35',North


In [None]:
# Replace non-numeric text with word-to-number conversion
def convert_mixedtext_to_number(x):
    from word2number import w2n
    import re
    try:
        return round(float(x), 2)  # also works for strings like '450.00'
    except:
        try:
            # Extract word part and digit part from the string
            x_str = str(x).lower()
            word_part = re.sub(r'[^a-z]', '', x_str)
            digit_part = re.sub(r'[^0-9.]', '', x_str)

            word_num = w2n.word_to_num(word_part) if word_part else 0
            digit_num = float(digit_part) if digit_part else 0

            return round(word_num * (10 ** len(digit_part)) + digit_num, 2) if word_part else round(digit_num, 2)
        except:
            return x  # or np.nan if using numpy


In [None]:
DATA["Sales"]=DATA["Sales"].apply(convert_mixedtext_to_number)

In [None]:
DATA

Unnamed: 0,Order_ID,Order_Date,Quantity,Discount,Sales,Postal_Code,Product_ID,Is_Returned,Customer_Age,Region
0,1001,2022-01-10,5,0.1,500.75,'560001',P-001,Yes,'29',South
1,1002,2022-01-12,3,0.05,450.0,110001,P-002,No,34,North
2,1003,2022-01-13,2,0.0,350.0,'700014',P-003,yes,45,East
3,1004,2022-01-15,4,0.2,450.0,'500081',P-004,NO,'Thirty',West
4,1005,2022-01-18,3,,480.5,560002,P-005,Yes,28,South
5,1006,2022-01-20,6,0.15,600.0,110002,P-006,no,32,North
6,1007,2022-01-22,5,0.05,500.0,700015,P-007,YES,,East
7,1008,2022-01-24,4,0.1,550.25,500082,P-008,nO,37,West
8,1009,2022-01-25,5,0.0,475.0,'560003',P-009,Yes,31,South
9,1010,NaT,6,0.1,,110003,P-010,No,'35',North


In [None]:
DATA["Sales"]=pd.to_numeric(DATA["Sales"])

In [None]:
DATA.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Order_ID      15 non-null     int64         
 1   Order_Date    14 non-null     datetime64[ns]
 2   Quantity      15 non-null     int64         
 3   Discount      14 non-null     float64       
 4   Sales         14 non-null     float64       
 5   Postal_Code   15 non-null     object        
 6   Product_ID    15 non-null     object        
 7   Is_Returned   15 non-null     object        
 8   Customer_Age  13 non-null     object        
 9   Region        15 non-null     object        
dtypes: datetime64[ns](1), float64(2), int64(2), object(5)
memory usage: 1.3+ KB


In [None]:
# WORKING ON POSTAL CODE":
DATA["Postal_Code"].value_counts()

Postal_Code
'560001'    1
110001      1
'700014'    1
'500081'    1
560002      1
110002      1
700015      1
500082      1
'560003'    1
110003      1
700016      1
'500083'    1
560004      1
110004      1
700017      1
Name: count, dtype: int64

In [None]:
DATA["Postal_Code"]=DATA["Postal_Code"].astype(str).str.replace("'","").str.replace('"','')

In [None]:
DATA["Postal_Code"]=pd.to_numeric(DATA["Postal_Code"])

In [None]:
DATA

Unnamed: 0,Order_ID,Order_Date,Quantity,Discount,Sales,Postal_Code,Product_ID,Is_Returned,Customer_Age,Region
0,1001,2022-01-10,5,0.1,500.75,560001,P-001,Yes,'29',South
1,1002,2022-01-12,3,0.05,450.0,110001,P-002,No,34,North
2,1003,2022-01-13,2,0.0,350.0,700014,P-003,yes,45,East
3,1004,2022-01-15,4,0.2,450.0,500081,P-004,NO,'Thirty',West
4,1005,2022-01-18,3,,480.5,560002,P-005,Yes,28,South
5,1006,2022-01-20,6,0.15,600.0,110002,P-006,no,32,North
6,1007,2022-01-22,5,0.05,500.0,700015,P-007,YES,,East
7,1008,2022-01-24,4,0.1,550.25,500082,P-008,nO,37,West
8,1009,2022-01-25,5,0.0,475.0,560003,P-009,Yes,31,South
9,1010,NaT,6,0.1,,110003,P-010,No,'35',North


In [None]:
# looking product id:
DATA["Product_ID"].value_counts()

Product_ID
P-001    1
P-002    1
P-003    1
P-004    1
P-005    1
P-006    1
P-007    1
P-008    1
P-009    1
P-010    1
P-011    1
P-012    1
P-013    1
P-014    1
P-015    1
Name: count, dtype: int64

In [None]:
# WORKING ON COUSTOMER AGE:
DATA["Customer_Age"].value_counts()

Customer_Age
'29'            1
34              1
45              1
'Thirty'        1
28              1
32              1
37              1
31              1
'35'            1
40              1
'Thirty-two'    1
36              1
'33'            1
Name: count, dtype: int64

In [None]:
DATA["Customer_Age"]=DATA["Customer_Age"].astype(str).str.replace("'","").str.replace('"','')

In [None]:
DATA["Customer_Age"]=DATA["Customer_Age"].apply(convert_text_to_number)

In [None]:
DATA["Customer_Age"]=pd.to_numeric(DATA["Customer_Age"],errors="coerce")

In [None]:
DATA

Unnamed: 0,Order_ID,Order_Date,Quantity,Discount,Sales,Postal_Code,Product_ID,Is_Returned,Customer_Age,Region
0,1001,2022-01-10,5,0.1,500.75,560001,P-001,Yes,29.0,South
1,1002,2022-01-12,3,0.05,450.0,110001,P-002,No,34.0,North
2,1003,2022-01-13,2,0.0,350.0,700014,P-003,yes,45.0,East
3,1004,2022-01-15,4,0.2,450.0,500081,P-004,NO,30.0,West
4,1005,2022-01-18,3,,480.5,560002,P-005,Yes,28.0,South
5,1006,2022-01-20,6,0.15,600.0,110002,P-006,no,32.0,North
6,1007,2022-01-22,5,0.05,500.0,700015,P-007,YES,,East
7,1008,2022-01-24,4,0.1,550.25,500082,P-008,nO,37.0,West
8,1009,2022-01-25,5,0.0,475.0,560003,P-009,Yes,31.0,South
9,1010,NaT,6,0.1,,110003,P-010,No,35.0,North


In [None]:
# DATA.to_csv(r"DATA\corrected_data_type.csv",index=False)

---
---

In [None]:
# LOADING DATASET:
import pandas as pd
DATA = pd.read_csv(r"C:\Users\Nagesh Agrawal\OneDrive\Desktop\6_MACHINE LEARNING\3__NATURAL LANGUAGE PROCESSING\NLP_DATASETS\NAMED ENTITY RECOGNITION DATA.csv")
DATA

Unnamed: 0,Headline,Content,News Categories,Date
0,Congress leader Baljinder Singh shot dead at h...,Congress leader Baljinder Singh was shot dead ...,['national'],19-09-2023
1,17-year-old girl preparing for NEET dies by su...,Another NEET aspirant died by suicide in Rajas...,['national'],19-09-2023
2,Hampers to welcome MPs in new Parliament tomor...,In order to mark the first-ever working day of...,['national'],19-09-2023
3,"Only 10% women lawmakers in RS, while only 14%...","Congress President Mallikarjun Kharge, while s...",['national'],19-09-2023
4,"Ganesh temple decorated with notes, coins wort...",The Sri Sathya Ganapathi Temple in Bengaluru a...,['national'],19-09-2023
...,...,...,...,...
307691,"Tamil Nadu to open 10,000 'CM's pharmacy store...",Tamil Nadu CM MK Stalin has announced that 'Ch...,"['national', 'Health___Fitness']",2024-08-26
307692,NMC study finds mental health issues prevalent...,One in four MBBS students has a mental disorde...,"['education', 'Health___Fitness', 'national']",2024-08-26
307693,Telangana CM says World Bank will help retire ...,Telangana CM Revanth Reddy said the World Bank...,"['politics', 'Health___Fitness', 'national']",2024-08-26
307694,Dr Gagandeep Kang explores role of AI in vacci...,"Dr Gagandeep Kang, a microbiologist and virolo...","['Health___Fitness', 'national']",2024-08-26


In [None]:
DATA.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307696 entries, 0 to 307695
Data columns (total 4 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   Headline         307696 non-null  object
 1   Content          307696 non-null  object
 2   News Categories  307696 non-null  object
 3   Date             307696 non-null  object
dtypes: object(4)
memory usage: 9.4+ MB


In [None]:
DATA["Date"].unique()

In [None]:
def FIX_DATE(date_str):
    from datetime import datetime
    try:
        # If format is 'yyyy-mm-dd', convert to 'dd-mm-yyyy'
        d1 = datetime.strptime(date_str, '%Y-%m-%d')
        return d1.strftime('%d-%m-%Y')
    except:
        try:
            # If already 'dd-mm-yyyy', keep it
            datetime.strptime(date_str, '%d-%m-%Y')
            return date_str
        except:
            return 'Invalid'

In [None]:
DATA["Date"] = DATA["Date"].apply(FIX_DATE)

In [None]:
DATA["Date"].unique

In [None]:
DATA["Date"] = pd.to_datetime(DATA["Date"], format="%d-%m-%Y")

In [None]:
DATA.info()